diff options
| author | Dimitry Andric <dim@FreeBSD.org> | 2023-07-26 19:03:47 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2023-07-26 19:04:23 +0000 |
| commit | 7fa27ce4a07f19b07799a767fc29416f3b625afb (patch) | |
| tree | 27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/Target/AMDGPU | |
| parent | e3b557809604d036af6e00c60f012c2025b59a5e (diff) | |
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
167 files changed, 16807 insertions, 10788 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index eaf72686c166..b82db82de84e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -26,6 +26,8 @@ FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone); FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone); void initializeAMDGPURegBankCombinerPass(PassRegistry &); +void initializeAMDGPURegBankSelectPass(PassRegistry &); + // SI Passes FunctionPass *createGCNDPPCombinePass(); FunctionPass *createSIAnnotateControlFlowPass(); @@ -39,6 +41,7 @@ FunctionPass *createSIFixControlFlowLiveIntervalsPass(); FunctionPass *createSIOptimizeExecMaskingPreRAPass(); FunctionPass *createSIOptimizeVGPRLiveRangePass(); FunctionPass *createSIFixSGPRCopiesPass(); +FunctionPass *createLowerWWMCopiesPass(); FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createSIPreAllocateWWMRegsPass(); @@ -47,13 +50,11 @@ FunctionPass *createSIFormMemoryClausesPass(); FunctionPass *createSIPostRABundlerPass(); FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *); FunctionPass *createAMDGPUUseNativeCallsPass(); +ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *); FunctionPass *createAMDGPUCodeGenPreparePass(); FunctionPass *createAMDGPULateCodeGenPreparePass(); FunctionPass *createAMDGPUMachineCFGStructurizerPass(); -FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *); -ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *); FunctionPass *createAMDGPURewriteOutArgumentsPass(); -ModulePass *createAMDGPUReplaceLDSUseWithPointerPass(); ModulePass *createAMDGPULowerModuleLDSPass(); FunctionPass *createSIModeRegisterPass(); FunctionPass *createGCNPreRAOptimizationsPass(); @@ -83,14 +84,13 @@ void initializeAMDGPUAttributorPass(PassRegistry &); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char 
&AMDGPUAnnotateKernelFeaturesID; -FunctionPass *createAMDGPUAtomicOptimizerPass(); +// DPP/Iterative option enables the atomic optimizer with given strategy +// whereas None disables the atomic optimizer. +enum class ScanOptions { DPP, Iterative, None }; +FunctionPass *createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy); void initializeAMDGPUAtomicOptimizerPass(PassRegistry &); extern char &AMDGPUAtomicOptimizerID; -ModulePass *createAMDGPULowerIntrinsicsPass(); -void initializeAMDGPULowerIntrinsicsPass(PassRegistry &); -extern char &AMDGPULowerIntrinsicsID; - ModulePass *createAMDGPUCtorDtorLoweringLegacyPass(); void initializeAMDGPUCtorDtorLoweringLegacyPass(PassRegistry &); extern char &AMDGPUCtorDtorLoweringLegacyPassID; @@ -117,38 +117,6 @@ struct AMDGPULowerKernelAttributesPass PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; -void initializeAMDGPUPropagateAttributesEarlyPass(PassRegistry &); -extern char &AMDGPUPropagateAttributesEarlyID; - -struct AMDGPUPropagateAttributesEarlyPass - : PassInfoMixin<AMDGPUPropagateAttributesEarlyPass> { - AMDGPUPropagateAttributesEarlyPass(TargetMachine &TM) : TM(TM) {} - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); - -private: - TargetMachine &TM; -}; - -void initializeAMDGPUPropagateAttributesLatePass(PassRegistry &); -extern char &AMDGPUPropagateAttributesLateID; - -struct AMDGPUPropagateAttributesLatePass - : PassInfoMixin<AMDGPUPropagateAttributesLatePass> { - AMDGPUPropagateAttributesLatePass(TargetMachine &TM) : TM(TM) {} - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); - -private: - TargetMachine &TM; -}; - -void initializeAMDGPUReplaceLDSUseWithPointerPass(PassRegistry &); -extern char &AMDGPUReplaceLDSUseWithPointerID; - -struct AMDGPUReplaceLDSUseWithPointerPass - : PassInfoMixin<AMDGPUReplaceLDSUseWithPointerPass> { - PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); -}; - void initializeAMDGPULowerModuleLDSPass(PassRegistry &); extern char 
&AMDGPULowerModuleLDSID; @@ -177,6 +145,9 @@ extern char &SIFixSGPRCopiesID; void initializeSIFixVGPRCopiesPass(PassRegistry &); extern char &SIFixVGPRCopiesID; +void initializeSILowerWWMCopiesPass(PassRegistry &); +extern char &SILowerWWMCopiesID; + void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; @@ -239,6 +210,16 @@ private: TargetMachine &TM; }; +struct AMDGPUAtomicOptimizerPass : PassInfoMixin<AMDGPUAtomicOptimizerPass> { + AMDGPUAtomicOptimizerPass(TargetMachine &TM, ScanOptions ScanImpl) + : TM(TM), ScanImpl(ScanImpl) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + +private: + TargetMachine &TM; + ScanOptions ScanImpl; +}; + Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUISelDag(TargetMachine &TM, CodeGenOpt::Level OptLevel); @@ -252,6 +233,16 @@ private: bool GlobalOpt; }; +class AMDGPUCodeGenPreparePass + : public PassInfoMixin<AMDGPUCodeGenPreparePass> { +private: + TargetMachine &TM; + +public: + AMDGPUCodeGenPreparePass(TargetMachine &TM) : TM(TM){}; + PreservedAnalyses run(Function &, FunctionAnalysisManager &); +}; + FunctionPass *createAMDGPUAnnotateUniformValues(); ModulePass *createAMDGPUPrintfRuntimeBinding(); @@ -286,6 +277,9 @@ extern char &AMDGPUAnnotateUniformValuesPassID; void initializeAMDGPUCodeGenPreparePass(PassRegistry&); extern char &AMDGPUCodeGenPrepareID; +void initializeAMDGPURemoveIncompatibleFunctionsPass(PassRegistry &); +extern char &AMDGPURemoveIncompatibleFunctionsID; + void initializeAMDGPULateCodeGenPreparePass(PassRegistry &); extern char &AMDGPULateCodeGenPrepareID; @@ -302,9 +296,6 @@ extern char &SIMemoryLegalizerID; void initializeSIModeRegisterPass(PassRegistry&); extern char &SIModeRegisterID; -void initializeAMDGPUReleaseVGPRsPass(PassRegistry &); -extern char &AMDGPUReleaseVGPRsID; - void initializeAMDGPUInsertDelayAluPass(PassRegistry &); extern char &AMDGPUInsertDelayAluID; @@ -340,12 +331,18 @@ extern char 
&AMDGPUOpenCLEnqueuedBlockLoweringID; void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; +void initializeGCNPreRALongBranchRegPass(PassRegistry &); +extern char &GCNPreRALongBranchRegID; + void initializeGCNPreRAOptimizationsPass(PassRegistry &); extern char &GCNPreRAOptimizationsID; FunctionPass *createAMDGPUSetWavePriorityPass(); void initializeAMDGPUSetWavePriorityPass(PassRegistry &); +void initializeGCNRewritePartialRegUsesPass(llvm::PassRegistry &); +extern char &GCNRewritePartialRegUsesID; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, @@ -363,53 +360,60 @@ enum TargetIndex { /// a separate piece of memory that is unique from other /// memory locations. namespace AMDGPUAS { - enum : unsigned { - // The maximum value for flat, generic, local, private, constant and region. - MAX_AMDGPU_ADDRESS = 7, +enum : unsigned { + // The maximum value for flat, generic, local, private, constant and region. + MAX_AMDGPU_ADDRESS = 8, - FLAT_ADDRESS = 0, ///< Address space for flat memory. - GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - REGION_ADDRESS = 2, ///< Address space for region memory. (GDS) + FLAT_ADDRESS = 0, ///< Address space for flat memory. + GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). + REGION_ADDRESS = 2, ///< Address space for region memory. (GDS) - CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2). - LOCAL_ADDRESS = 3, ///< Address space for local memory. - PRIVATE_ADDRESS = 5, ///< Address space for private memory. + CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2). + LOCAL_ADDRESS = 3, ///< Address space for local memory. + PRIVATE_ADDRESS = 5, ///< Address space for private memory. - CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory. + CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory. - BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers. 
+ BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers. + ///< Not used in backend. - /// Address space for direct addressable parameter memory (CONST0). - PARAM_D_ADDRESS = 6, - /// Address space for indirect addressable parameter memory (VTX1). - PARAM_I_ADDRESS = 7, + BUFFER_RESOURCE = 8, ///< Address space for 128-bit buffer resources. - // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on - // this order to be able to dynamically index a constant buffer, for - // example: - // - // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx + /// Internal address spaces. Can be freely renumbered. + STREAMOUT_REGISTER = 128, ///< Address space for GS NGG Streamout registers. + /// end Internal address spaces. - CONSTANT_BUFFER_0 = 8, - CONSTANT_BUFFER_1 = 9, - CONSTANT_BUFFER_2 = 10, - CONSTANT_BUFFER_3 = 11, - CONSTANT_BUFFER_4 = 12, - CONSTANT_BUFFER_5 = 13, - CONSTANT_BUFFER_6 = 14, - CONSTANT_BUFFER_7 = 15, - CONSTANT_BUFFER_8 = 16, - CONSTANT_BUFFER_9 = 17, - CONSTANT_BUFFER_10 = 18, - CONSTANT_BUFFER_11 = 19, - CONSTANT_BUFFER_12 = 20, - CONSTANT_BUFFER_13 = 21, - CONSTANT_BUFFER_14 = 22, - CONSTANT_BUFFER_15 = 23, + /// Address space for direct addressable parameter memory (CONST0). + PARAM_D_ADDRESS = 6, + /// Address space for indirect addressable parameter memory (VTX1). + PARAM_I_ADDRESS = 7, - // Some places use this if the address space can't be determined. - UNKNOWN_ADDRESS_SPACE = ~0u, - }; + // Do not re-order the CONSTANT_BUFFER_* enums. 
Several places depend on + // this order to be able to dynamically index a constant buffer, for + // example: + // + // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx + + CONSTANT_BUFFER_0 = 8, + CONSTANT_BUFFER_1 = 9, + CONSTANT_BUFFER_2 = 10, + CONSTANT_BUFFER_3 = 11, + CONSTANT_BUFFER_4 = 12, + CONSTANT_BUFFER_5 = 13, + CONSTANT_BUFFER_6 = 14, + CONSTANT_BUFFER_7 = 15, + CONSTANT_BUFFER_8 = 16, + CONSTANT_BUFFER_9 = 17, + CONSTANT_BUFFER_10 = 18, + CONSTANT_BUFFER_11 = 19, + CONSTANT_BUFFER_12 = 20, + CONSTANT_BUFFER_13 = 21, + CONSTANT_BUFFER_14 = 22, + CONSTANT_BUFFER_15 = 23, + + // Some places use this if the address space can't be determined. + UNKNOWN_ADDRESS_SPACE = ~0u, +}; } namespace AMDGPU { @@ -421,6 +425,38 @@ inline bool isFlatGlobalAddrSpace(unsigned AS) { AS == AMDGPUAS::CONSTANT_ADDRESS || AS > AMDGPUAS::MAX_AMDGPU_ADDRESS; } + +inline bool isExtendedGlobalAddrSpace(unsigned AS) { + return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || + AS > AMDGPUAS::MAX_AMDGPU_ADDRESS; +} + +static inline bool addrspacesMayAlias(unsigned AS1, unsigned AS2) { + static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 8, "Addr space out of range"); + + if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS) + return true; + + // This array is indexed by address space value enum elements 0 ... 
to 8 + // clang-format off + static const bool ASAliasRules[9][9] = { + /* Flat Global Region Group Constant Private Const32 BufFatPtr BufRsrc */ + /* Flat */ {true, true, false, true, true, true, true, true, true}, + /* Global */ {true, true, false, false, true, false, true, true, true}, + /* Region */ {false, false, true, false, false, false, false, false, false}, + /* Group */ {true, false, false, true, false, false, false, false, false}, + /* Constant */ {true, true, false, false, false, false, true, true, true}, + /* Private */ {true, false, false, false, false, true, false, false, false}, + /* Constant 32-bit */ {true, true, false, false, true, false, false, true, true}, + /* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true}, + /* Buffer Resource */ {true, true, false, false, true, false, true, true, true}, + }; + // clang-format on + + return ASAliasRules[AS1][AS2]; +} + } } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index ddc32988881a..b178623a319d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -18,10 +18,6 @@ def p4 : PtrValueType<i64, 4>; def p5 : PtrValueType<i32, 5>; def p6 : PtrValueType<i32, 6>; -class BoolToList<bit Value> { - list<int> ret = !if(Value, [1]<int>, []<int>); -} - //===------------------------------------------------------------===// // Subtarget Features (device properties) //===------------------------------------------------------------===// @@ -494,6 +490,12 @@ def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding", "Support NSA encoding for image instructions" >; +def FeaturePartialNSAEncoding : SubtargetFeature<"partial-nsa-encoding", + "HasPartialNSAEncoding", + "true", + "Support partial NSA encoding for image instructions" +>; + def FeatureImageInsts : SubtargetFeature<"image-insts", "HasImageInsts", "true", @@ -581,7 +583,7 @@ def FeatureDot6Insts : SubtargetFeature<"dot6-insts", def FeatureDot7Insts 
: SubtargetFeature<"dot7-insts", "HasDot7Insts", "true", - "Has v_dot2_f32_f16, v_dot4_u32_u8, v_dot8_u32_u4 instructions" + "Has v_dot4_u32_u8, v_dot8_u32_u4 instructions" >; def FeatureDot8Insts : SubtargetFeature<"dot8-insts", @@ -596,6 +598,12 @@ def FeatureDot9Insts : SubtargetFeature<"dot9-insts", "Has v_dot2_f16_f16, v_dot2_bf16_bf16, v_dot2_f32_bf16 instructions" >; +def FeatureDot10Insts : SubtargetFeature<"dot10-insts", + "HasDot10Insts", + "true", + "Has v_dot2_f32_f16 instruction" +>; + def FeatureMAIInsts : SubtargetFeature<"mai-insts", "HasMAIInsts", "true", @@ -614,6 +622,19 @@ def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst", "Has v_pk_fmac_f16 instruction" >; +def FeatureAtomicDsPkAdd16Insts : SubtargetFeature<"atomic-ds-pk-add-16-insts", + "HasAtomicDsPkAdd16Insts", + "true", + "Has ds_pk_add_bf16, ds_pk_add_f16, ds_pk_add_rtn_bf16, " + "ds_pk_add_rtn_f16 instructions" +>; + +def FeatureAtomicFlatPkAdd16Insts : SubtargetFeature<"atomic-flat-pk-add-16-insts", + "HasAtomicFlatPkAdd16Insts", + "true", + "Has flat_atomic_pk_add_f16 and flat_atomic_pk_add_bf16 instructions" +>; + def FeatureAtomicFaddRtnInsts : SubtargetFeature<"atomic-fadd-rtn-insts", "HasAtomicFaddRtnInsts", "true", @@ -630,15 +651,30 @@ def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts", [FeatureFlatGlobalInsts] >; -def FeatureAtomicPkFaddNoRtnInsts - : SubtargetFeature<"atomic-pk-fadd-no-rtn-insts", - "HasAtomicPkFaddNoRtnInsts", +def FeatureAtomicBufferGlobalPkAddF16NoRtnInsts + : SubtargetFeature<"atomic-buffer-global-pk-add-f16-no-rtn-insts", + "HasAtomicBufferGlobalPkAddF16NoRtnInsts", "true", "Has buffer_atomic_pk_add_f16 and global_atomic_pk_add_f16 instructions that " "don't return original value", [FeatureFlatGlobalInsts] >; +def FeatureAtomicBufferGlobalPkAddF16Insts : SubtargetFeature<"atomic-buffer-global-pk-add-f16-insts", + "HasAtomicBufferGlobalPkAddF16Insts", + "true", + "Has buffer_atomic_pk_add_f16 and 
global_atomic_pk_add_f16 instructions that " + "can return original value", + [FeatureFlatGlobalInsts] +>; + +def FeatureAtomicGlobalPkAddBF16Inst : SubtargetFeature<"atomic-global-pk-add-bf16-inst", + "HasAtomicGlobalPkAddBF16Inst", + "true", + "Has global_atomic_pk_add_bf16 instruction", + [FeatureFlatGlobalInsts] +>; + def FeatureFlatAtomicFaddF32Inst : SubtargetFeature<"flat-atomic-fadd-f32-inst", "HasFlatAtomicFaddF32Inst", @@ -718,15 +754,6 @@ def FeatureGFX11FullVGPRs : SubtargetFeature<"gfx11-full-vgprs", "GFX11 with 50% more physical VGPRs and 50% larger allocation granule than GFX10" >; -class SubtargetFeatureNSAMaxSize <int Value> : SubtargetFeature < - "nsa-max-size-"#Value, - "NSAMaxSize", - !cast<string>(Value), - "The maximum non-sequential address size in VGPRs." ->; - -def FeatureNSAMaxSize5 : SubtargetFeatureNSAMaxSize<5>; -def FeatureNSAMaxSize13 : SubtargetFeatureNSAMaxSize<13>; def FeatureVOPD : SubtargetFeature<"vopd", "HasVOPDInsts", @@ -740,6 +767,12 @@ def FeatureVALUTransUseHazard : SubtargetFeature<"valu-trans-use-hazard", "Hazard when TRANS instructions are closely followed by a use of the result" >; +def FeatureForceStoreSC0SC1 : SubtargetFeature<"force-store-sc0-sc1", + "HasForceStoreSC0SC1", + "true", + "Has SC0 and SC1 on stores" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -860,12 +893,20 @@ def FeatureArchitectedFlatScratch : SubtargetFeature<"architected-flat-scratch", "Flat Scratch register is a readonly SPI initialized architected register" >; +def FeatureArchitectedSGPRs : SubtargetFeature<"architected-sgprs", + "HasArchitectedSGPRs", + "true", + "Enable the architected SGPRs" +>; + // Dummy feature used to disable assembler instructions. 
def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", "Dummy feature to disable assembler instructions" >; +//===----------------------------------------------------------------------===// + class GCNSubtargetFeatureGeneration <string Value, string FeatureName, list<SubtargetFeature> Implies> : @@ -962,6 +1003,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", ] >; +//===----------------------------------------------------------------------===// + class FeatureSet<list<SubtargetFeature> Features_> { list<SubtargetFeature> Features = Features_; } @@ -1006,30 +1049,28 @@ def FeatureISAVersion7_0_5 : FeatureSet< [FeatureSeaIslands, FeatureLDSBankCount16]>; -def FeatureISAVersion8_0_1 : FeatureSet< +def FeatureISAVersion8_0_Common : FeatureSet< [FeatureVolcanicIslands, - FeatureFastFMAF32, - HalfRate64Ops, FeatureLDSBankCount32, - FeatureSupportsXNACK, FeatureUnpackedD16VMem]>; +def FeatureISAVersion8_0_1 : FeatureSet< + !listconcat(FeatureISAVersion8_0_Common.Features, + [FeatureFastFMAF32, + HalfRate64Ops, + FeatureSupportsXNACK])>; + def FeatureISAVersion8_0_2 : FeatureSet< - [FeatureVolcanicIslands, - FeatureLDSBankCount32, - FeatureSGPRInitBug, - FeatureUnpackedD16VMem]>; + !listconcat(FeatureISAVersion8_0_Common.Features, + [FeatureSGPRInitBug])>; def FeatureISAVersion8_0_3 : FeatureSet< - [FeatureVolcanicIslands, - FeatureLDSBankCount32, - FeatureUnpackedD16VMem]>; + !listconcat(FeatureISAVersion8_0_Common.Features, + [])>; def FeatureISAVersion8_0_5 : FeatureSet< - [FeatureVolcanicIslands, - FeatureLDSBankCount32, - FeatureSGPRInitBug, - FeatureUnpackedD16VMem]>; + !listconcat(FeatureISAVersion8_0_Common.Features, + [FeatureSGPRInitBug])>; def FeatureISAVersion8_1_0 : FeatureSet< [FeatureVolcanicIslands, @@ -1038,126 +1079,101 @@ def FeatureISAVersion8_1_0 : FeatureSet< FeatureImageStoreD16Bug, FeatureImageGather4D16Bug]>; -def FeatureISAVersion9_0_0 : FeatureSet< +def FeatureISAVersion9_0_Common : FeatureSet< [FeatureGFX9, - 
FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureDsSrc2Insts, - FeatureExtendedImageInsts, FeatureImageInsts, - FeatureMadMacF32Insts, - FeatureImageGather4D16Bug]>; + FeatureMadMacF32Insts]>; + +def FeatureISAVersion9_0_MI_Common : FeatureSet< + !listconcat(FeatureISAVersion9_0_Common.Features, + [FeatureFmaMixInsts, + FeatureDLInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDot3Insts, + FeatureDot4Insts, + FeatureDot5Insts, + FeatureDot6Insts, + FeatureDot7Insts, + FeatureDot10Insts, + FeatureMAIInsts, + FeaturePkFmacF16Inst, + FeatureAtomicFaddNoRtnInsts, + FeatureSupportsSRAMECC])>; + +def FeatureISAVersion9_0_0 : FeatureSet< + !listconcat(FeatureISAVersion9_0_Common.Features, + [FeatureMadMixInsts, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureImageGather4D16Bug])>; def FeatureISAVersion9_0_2 : FeatureSet< - [FeatureGFX9, - FeatureMadMixInsts, - FeatureLDSBankCount32, - FeatureDsSrc2Insts, - FeatureExtendedImageInsts, - FeatureImageInsts, - FeatureMadMacF32Insts, - FeatureImageGather4D16Bug]>; + !listconcat(FeatureISAVersion9_0_Common.Features, + [FeatureMadMixInsts, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureImageGather4D16Bug])>; def FeatureISAVersion9_0_4 : FeatureSet< - [FeatureGFX9, - FeatureLDSBankCount32, - FeatureDsSrc2Insts, - FeatureExtendedImageInsts, - FeatureImageInsts, - FeatureMadMacF32Insts, - FeatureFmaMixInsts, - FeatureImageGather4D16Bug]>; + !listconcat(FeatureISAVersion9_0_Common.Features, + [FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureFmaMixInsts, + FeatureImageGather4D16Bug])>; def FeatureISAVersion9_0_6 : FeatureSet< - [FeatureGFX9, - HalfRate64Ops, - FeatureFmaMixInsts, - FeatureLDSBankCount32, - FeatureDsSrc2Insts, - FeatureExtendedImageInsts, - FeatureImageInsts, - FeatureMadMacF32Insts, - FeatureDLInsts, - FeatureDot1Insts, - FeatureDot2Insts, - FeatureDot7Insts, - FeatureSupportsSRAMECC, - FeatureImageGather4D16Bug]>; + !listconcat(FeatureISAVersion9_0_Common.Features, + 
[HalfRate64Ops, + FeatureFmaMixInsts, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureDLInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDot7Insts, + FeatureDot10Insts, + FeatureSupportsSRAMECC, + FeatureImageGather4D16Bug])>; def FeatureISAVersion9_0_8 : FeatureSet< - [FeatureGFX9, - HalfRate64Ops, - FeatureFmaMixInsts, - FeatureLDSBankCount32, - FeatureDsSrc2Insts, - FeatureExtendedImageInsts, - FeatureImageInsts, - FeatureMadMacF32Insts, - FeatureDLInsts, - FeatureDot1Insts, - FeatureDot2Insts, - FeatureDot3Insts, - FeatureDot4Insts, - FeatureDot5Insts, - FeatureDot6Insts, - FeatureDot7Insts, - FeatureMAIInsts, - FeaturePkFmacF16Inst, - FeatureAtomicFaddNoRtnInsts, - FeatureAtomicPkFaddNoRtnInsts, - FeatureSupportsSRAMECC, - FeatureMFMAInlineLiteralBug, - FeatureImageGather4D16Bug]>; + !listconcat(FeatureISAVersion9_0_MI_Common.Features, + [HalfRate64Ops, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureAtomicBufferGlobalPkAddF16NoRtnInsts, + FeatureMFMAInlineLiteralBug, + FeatureImageGather4D16Bug])>; def FeatureISAVersion9_0_9 : FeatureSet< - [FeatureGFX9, - FeatureMadMixInsts, - FeatureLDSBankCount32, - FeatureDsSrc2Insts, - FeatureExtendedImageInsts, - FeatureImageInsts, - FeatureMadMacF32Insts, - FeatureImageGather4D16Bug]>; + !listconcat(FeatureISAVersion9_0_Common.Features, + [FeatureMadMixInsts, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureImageInsts, + FeatureImageGather4D16Bug])>; def FeatureISAVersion9_0_A : FeatureSet< - [FeatureGFX9, - FeatureGFX90AInsts, - FeatureFmaMixInsts, - FeatureLDSBankCount32, - FeatureDLInsts, - FeatureFmacF64Inst, - FeatureDot1Insts, - FeatureDot2Insts, - FeatureDot3Insts, - FeatureDot4Insts, - FeatureDot5Insts, - FeatureDot6Insts, - FeatureDot7Insts, - Feature64BitDPP, - FeaturePackedFP32Ops, - FeatureMAIInsts, - FeaturePkFmacF16Inst, - FeatureAtomicFaddRtnInsts, - FeatureAtomicFaddNoRtnInsts, - FeatureAtomicPkFaddNoRtnInsts, - FeatureImageInsts, - FeatureMadMacF32Insts, - 
FeatureSupportsSRAMECC, - FeaturePackedTID, - FullRate64Ops, - FeatureBackOffBarrier]>; + !listconcat(FeatureISAVersion9_0_MI_Common.Features, + [FeatureGFX90AInsts, + FeatureFmacF64Inst, + Feature64BitDPP, + FeaturePackedFP32Ops, + FeatureAtomicFaddRtnInsts, + FeatureAtomicBufferGlobalPkAddF16Insts, + FeaturePackedTID, + FullRate64Ops, + FeatureBackOffBarrier])>; def FeatureISAVersion9_0_C : FeatureSet< - [FeatureGFX9, - FeatureMadMixInsts, - FeatureLDSBankCount32, - FeatureDsSrc2Insts, - FeatureExtendedImageInsts, - FeatureImageInsts, - FeatureMadMacF32Insts, - FeatureImageGather4D16Bug]>; + !listconcat(FeatureISAVersion9_0_Common.Features, + [FeatureMadMixInsts, + FeatureDsSrc2Insts, + FeatureExtendedImageInsts, + FeatureImageGather4D16Bug])>; -def FeatureISAVersion9_4_0 : FeatureSet< +def FeatureISAVersion9_4_Common : FeatureSet< [FeatureGFX9, FeatureGFX90AInsts, FeatureGFX940Insts, @@ -1172,6 +1188,9 @@ def FeatureISAVersion9_4_0 : FeatureSet< FeatureDot5Insts, FeatureDot6Insts, FeatureDot7Insts, + FeatureDot10Insts, + FeatureAtomicDsPkAdd16Insts, + FeatureAtomicFlatPkAdd16Insts, Feature64BitDPP, FeaturePackedFP32Ops, FeatureMAIInsts, @@ -1179,7 +1198,8 @@ def FeatureISAVersion9_4_0 : FeatureSet< FeaturePkFmacF16Inst, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, - FeatureAtomicPkFaddNoRtnInsts, + FeatureAtomicBufferGlobalPkAddF16Insts, + FeatureAtomicGlobalPkAddBF16Inst, FeatureFlatAtomicFaddF32Inst, FeatureSupportsSRAMECC, FeaturePackedTID, @@ -1187,33 +1207,29 @@ def FeatureISAVersion9_4_0 : FeatureSet< FullRate64Ops, FeatureBackOffBarrier]>; -// TODO: Organize more features into groups. -def FeatureGroup { - // Bugs present on gfx10.1. 
- list<SubtargetFeature> GFX10_1_Bugs = [ - FeatureVcmpxPermlaneHazard, - FeatureVMEMtoScalarWriteHazard, - FeatureSMEMtoVectorWriteHazard, - FeatureInstFwdPrefetchBug, - FeatureVcmpxExecWARHazard, - FeatureLdsBranchVmemWARHazard, - FeatureNSAtoVMEMBug, - FeatureNSAClauseBug, - FeatureOffset3fBug, - FeatureFlatSegmentOffsetBug, - FeatureNegativeUnalignedScratchOffsetBug - ]; -} +def FeatureISAVersion9_4_0 : FeatureSet< + !listconcat(FeatureISAVersion9_4_Common.Features, + [FeatureForceStoreSC0SC1])>; -def FeatureISAVersion10_1_0 : FeatureSet< - !listconcat(FeatureGroup.GFX10_1_Bugs, - [FeatureGFX10, - FeatureLDSBankCount32, - FeatureDLInsts, - FeatureNSAEncoding, - FeatureNSAMaxSize5, - FeatureWavefrontSize32, - FeatureScalarStores, +def FeatureISAVersion9_4_1 : FeatureSet< + !listconcat(FeatureISAVersion9_4_Common.Features, + [FeatureForceStoreSC0SC1])>; + +def FeatureISAVersion9_4_2 : FeatureSet< + !listconcat(FeatureISAVersion9_4_Common.Features, + [])>; + +def FeatureISAVersion10_Common : FeatureSet< + [FeatureGFX10, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureNSAEncoding, + FeatureWavefrontSize32, + FeatureBackOffBarrier]>; + +def FeatureISAVersion10_1_Common : FeatureSet< + !listconcat(FeatureISAVersion10_Common.Features, + [FeatureScalarStores, FeatureScalarAtomics, FeatureScalarFlatScratchInsts, FeatureGetWaveIdInst, @@ -1221,90 +1237,57 @@ def FeatureISAVersion10_1_0 : FeatureSet< FeatureDsSrc2Insts, FeatureLdsMisalignedBug, FeatureSupportsXNACK, - FeatureBackOffBarrier])>; + // gfx101x bugs + FeatureVcmpxPermlaneHazard, + FeatureVMEMtoScalarWriteHazard, + FeatureSMEMtoVectorWriteHazard, + FeatureInstFwdPrefetchBug, + FeatureVcmpxExecWARHazard, + FeatureLdsBranchVmemWARHazard, + FeatureNSAtoVMEMBug, + FeatureNSAClauseBug, + FeatureOffset3fBug, + FeatureFlatSegmentOffsetBug, + FeatureNegativeUnalignedScratchOffsetBug])>; + +def FeatureISAVersion10_1_0 : FeatureSet< + !listconcat(FeatureISAVersion10_1_Common.Features, + [])>; def 
FeatureISAVersion10_1_1 : FeatureSet< - !listconcat(FeatureGroup.GFX10_1_Bugs, - [FeatureGFX10, - FeatureLDSBankCount32, - FeatureDLInsts, - FeatureDot1Insts, + !listconcat(FeatureISAVersion10_1_Common.Features, + [FeatureDot1Insts, FeatureDot2Insts, FeatureDot5Insts, FeatureDot6Insts, FeatureDot7Insts, - FeatureNSAEncoding, - FeatureNSAMaxSize5, - FeatureWavefrontSize32, - FeatureScalarStores, - FeatureScalarAtomics, - FeatureScalarFlatScratchInsts, - FeatureGetWaveIdInst, - FeatureMadMacF32Insts, - FeatureDsSrc2Insts, - FeatureLdsMisalignedBug, - FeatureSupportsXNACK, - FeatureBackOffBarrier])>; + FeatureDot10Insts])>; def FeatureISAVersion10_1_2 : FeatureSet< - !listconcat(FeatureGroup.GFX10_1_Bugs, - [FeatureGFX10, - FeatureLDSBankCount32, - FeatureDLInsts, - FeatureDot1Insts, + !listconcat(FeatureISAVersion10_1_Common.Features, + [FeatureDot1Insts, FeatureDot2Insts, FeatureDot5Insts, FeatureDot6Insts, FeatureDot7Insts, - FeatureNSAEncoding, - FeatureNSAMaxSize5, - FeatureWavefrontSize32, - FeatureScalarStores, - FeatureScalarAtomics, - FeatureScalarFlatScratchInsts, - FeatureGetWaveIdInst, - FeatureMadMacF32Insts, - FeatureDsSrc2Insts, - FeatureLdsMisalignedBug, - FeatureSupportsXNACK, - FeatureBackOffBarrier])>; + FeatureDot10Insts])>; def FeatureISAVersion10_1_3 : FeatureSet< - !listconcat(FeatureGroup.GFX10_1_Bugs, - [FeatureGFX10, - FeatureGFX10_AEncoding, - FeatureLDSBankCount32, - FeatureDLInsts, - FeatureNSAEncoding, - FeatureNSAMaxSize5, - FeatureWavefrontSize32, - FeatureScalarStores, - FeatureScalarAtomics, - FeatureScalarFlatScratchInsts, - FeatureGetWaveIdInst, - FeatureMadMacF32Insts, - FeatureDsSrc2Insts, - FeatureLdsMisalignedBug, - FeatureSupportsXNACK, - FeatureBackOffBarrier])>; + !listconcat(FeatureISAVersion10_1_Common.Features, + [FeatureGFX10_AEncoding])>; def FeatureISAVersion10_3_0 : FeatureSet< - [FeatureGFX10, - FeatureGFX10_AEncoding, - FeatureGFX10_BEncoding, - FeatureGFX10_3Insts, - FeatureLDSBankCount32, - FeatureDLInsts, - 
FeatureDot1Insts, - FeatureDot2Insts, - FeatureDot5Insts, - FeatureDot6Insts, - FeatureDot7Insts, - FeatureNSAEncoding, - FeatureNSAMaxSize13, - FeatureWavefrontSize32, - FeatureShaderCyclesRegister, - FeatureBackOffBarrier]>; + !listconcat(FeatureISAVersion10_Common.Features, + [FeatureGFX10_AEncoding, + FeatureGFX10_BEncoding, + FeatureGFX10_3Insts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDot5Insts, + FeatureDot6Insts, + FeatureDot7Insts, + FeatureDot10Insts, + FeatureShaderCyclesRegister])>; def FeatureISAVersion11_Common : FeatureSet< [FeatureGFX11, @@ -1314,8 +1297,9 @@ def FeatureISAVersion11_Common : FeatureSet< FeatureDot7Insts, FeatureDot8Insts, FeatureDot9Insts, + FeatureDot10Insts, FeatureNSAEncoding, - FeatureNSAMaxSize5, + FeaturePartialNSAEncoding, FeatureWavefrontSize32, FeatureShaderCyclesRegister, FeatureArchitectedFlatScratch, @@ -1325,26 +1309,37 @@ def FeatureISAVersion11_Common : FeatureSet< FeatureImageInsts, FeaturePackedTID, FeatureVcmpxPermlaneHazard, - FeatureVALUTransUseHazard, FeatureMADIntraFwdBug]>; -def FeatureISAVersion11_0_0 : FeatureSet< +def FeatureISAVersion11_0_Common : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, + [FeatureVALUTransUseHazard])>; + +def FeatureISAVersion11_0_0 : FeatureSet< + !listconcat(FeatureISAVersion11_0_Common.Features, [FeatureGFX11FullVGPRs, FeatureUserSGPRInit16Bug])>; def FeatureISAVersion11_0_1 : FeatureSet< - !listconcat(FeatureISAVersion11_Common.Features, + !listconcat(FeatureISAVersion11_0_Common.Features, [FeatureGFX11FullVGPRs])>; def FeatureISAVersion11_0_2 : FeatureSet< - !listconcat(FeatureISAVersion11_Common.Features, + !listconcat(FeatureISAVersion11_0_Common.Features, [FeatureUserSGPRInit16Bug])>; def FeatureISAVersion11_0_3 : FeatureSet< + !listconcat(FeatureISAVersion11_0_Common.Features, + [])>; + +def FeatureISAVersion11_5_0 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [])>; +def FeatureISAVersion11_5_1 : FeatureSet< + 
!listconcat(FeatureISAVersion11_Common.Features, + [FeatureGFX11FullVGPRs])>; + //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { @@ -1522,6 +1517,9 @@ def isGFX9Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, AssemblerPredicate<(all_of FeatureGFX9Insts)>; +def isNotGFX9Plus : + Predicate<"Subtarget->getGeneration() < AMDGPUSubtarget::GFX9">; + def isGFX9Only : Predicate < "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureGFX9Insts)>; @@ -1655,6 +1653,8 @@ def NotHasTrue16BitInsts : Predicate<"!Subtarget->hasTrue16BitInsts()">; def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<(all_of FeatureVOP3P)>; +def NotHasMed3_16 : Predicate<"!Subtarget->hasMed3_16()">; + def HasMinMaxDenormModes : Predicate<"Subtarget->supportsMinMaxDenormModes()">; def NotHasMinMaxDenormModes : Predicate<"!Subtarget->supportsMinMaxDenormModes()">; @@ -1766,6 +1766,9 @@ def HasDot8Insts : Predicate<"Subtarget->hasDot8Insts()">, def HasDot9Insts : Predicate<"Subtarget->hasDot9Insts()">, AssemblerPredicate<(all_of FeatureDot9Insts)>; +def HasDot10Insts : Predicate<"Subtarget->hasDot10Insts()">, + AssemblerPredicate<(all_of FeatureDot10Insts)>; + def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">, AssemblerPredicate<(all_of FeatureGetWaveIdInst)>; @@ -1793,13 +1796,25 @@ def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">, def HasFmaLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts()">, AssemblerPredicate<(any_of FeatureGFX10_3Insts)>; +def HasAtomicDsPkAdd16Insts : Predicate<"Subtarget->hasAtomicDsPkAdd16Insts()">, + AssemblerPredicate<(any_of FeatureAtomicDsPkAdd16Insts)>; + +def HasAtomicFlatPkAdd16Insts : Predicate<"Subtarget->hasAtomicFlatPkAdd16Insts()">, + AssemblerPredicate<(any_of FeatureAtomicFlatPkAdd16Insts)>; + def HasAtomicFaddRtnInsts : 
Predicate<"Subtarget->hasAtomicFaddRtnInsts()">, AssemblerPredicate<(all_of FeatureAtomicFaddRtnInsts)>; def HasAtomicFaddNoRtnInsts : Predicate<"Subtarget->hasAtomicFaddNoRtnInsts()">, AssemblerPredicate<(all_of FeatureAtomicFaddNoRtnInsts)>; -def HasAtomicPkFaddNoRtnInsts - : Predicate<"Subtarget->hasAtomicPkFaddNoRtnInsts()">, - AssemblerPredicate<(all_of FeatureAtomicPkFaddNoRtnInsts)>; +def HasAtomicBufferGlobalPkAddF16NoRtnInsts + : Predicate<"Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() || Subtarget->hasAtomicBufferGlobalPkAddF16Insts()">, + AssemblerPredicate<(any_of FeatureAtomicBufferGlobalPkAddF16NoRtnInsts, FeatureAtomicBufferGlobalPkAddF16Insts)>; +def HasAtomicBufferGlobalPkAddF16Insts + : Predicate<"Subtarget->hasAtomicBufferGlobalPkAddF16Insts()">, + AssemblerPredicate<(all_of FeatureAtomicBufferGlobalPkAddF16Insts)>; +def HasAtomicGlobalPkAddBF16Inst + : Predicate<"Subtarget->hasAtomicGlobalPkAddBF16Inst()">, + AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>; def HasFlatAtomicFaddF32Inst : Predicate<"Subtarget->hasFlatAtomicFaddF32Inst()">, AssemblerPredicate<(all_of FeatureFlatAtomicFaddF32Inst)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp index 8155c895e366..63942414bf3c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -46,41 +46,14 @@ void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); } -static AliasResult getAliasResult(unsigned AS1, unsigned AS2) { - static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 7, "Addr space out of range"); - - if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS) - return AliasResult::MayAlias; - -#define ASMay AliasResult::MayAlias -#define ASNo AliasResult::NoAlias - // This array is indexed by address space value enum elements 0 ... 
to 7 - static const AliasResult ASAliasRules[8][8] = { - /* Flat Global Region Group Constant Private Const32 Buf Fat Ptr */ - /* Flat */ {ASMay, ASMay, ASNo, ASMay, ASMay, ASMay, ASMay, ASMay}, - /* Global */ {ASMay, ASMay, ASNo, ASNo, ASMay, ASNo, ASMay, ASMay}, - /* Region */ {ASNo, ASNo, ASMay, ASNo, ASNo, ASNo, ASNo, ASNo}, - /* Group */ {ASMay, ASNo, ASNo, ASMay, ASNo, ASNo, ASNo, ASNo}, - /* Constant */ {ASMay, ASMay, ASNo, ASNo, ASNo, ASNo, ASMay, ASMay}, - /* Private */ {ASMay, ASNo, ASNo, ASNo, ASNo, ASMay, ASNo, ASNo}, - /* Constant 32-bit */ {ASMay, ASMay, ASNo, ASNo, ASMay, ASNo, ASNo, ASMay}, - /* Buffer Fat Ptr */ {ASMay, ASMay, ASNo, ASNo, ASMay, ASNo, ASMay, ASMay} - }; -#undef ASMay -#undef ASNo - - return ASAliasRules[AS1][AS2]; -} - AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, const MemoryLocation &LocB, AAQueryInfo &AAQI, const Instruction *) { unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace(); unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace(); - AliasResult Result = getAliasResult(asA, asB); - if (Result == AliasResult::NoAlias) - return Result; + if (!AMDGPU::addrspacesMayAlias(asA, asB)) + return AliasResult::NoAlias; // In general, FLAT (generic) pointers could be aliased to LOCAL or PRIVATE // pointers. 
However, as LOCAL or PRIVATE pointers point to local objects, in diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index 2e24e9f929d2..b53def912ab6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -127,7 +127,7 @@ static bool alwaysInlineImpl(Module &M, bool GlobalOpt) { unsigned AS = GV.getAddressSpace(); if ((AS == AMDGPUAS::REGION_ADDRESS) || (AS == AMDGPUAS::LOCAL_ADDRESS && - (!AMDGPUTargetMachine::EnableLowerModuleLDS || !GV.hasInitializer()))) + (!AMDGPUTargetMachine::EnableLowerModuleLDS))) recursivelyVisitUsers(GV, FuncsToAlwaysInline); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index 74be0336851c..6a409f0dcbe7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -16,8 +16,8 @@ #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDGPUMemoryUtils.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/IR/InstVisitor.h" #include "llvm/InitializePasses.h" @@ -29,7 +29,7 @@ namespace { class AMDGPUAnnotateUniformValues : public FunctionPass, public InstVisitor<AMDGPUAnnotateUniformValues> { - LegacyDivergenceAnalysis *DA; + UniformityInfo *UA; MemorySSA *MSSA; AliasAnalysis *AA; bool isEntryFunc; @@ -55,7 +55,7 @@ public: return "AMDGPU Annotate Uniform Values"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LegacyDivergenceAnalysis>(); + AU.addRequired<UniformityInfoWrapperPass>(); AU.addRequired<MemorySSAWrapperPass>(); AU.addRequired<AAResultsWrapperPass>(); AU.setPreservesAll(); @@ -69,7 +69,7 @@ public: INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE, "Add AMDGPU uniform 
metadata", false, false) -INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, @@ -78,13 +78,13 @@ INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, char AMDGPUAnnotateUniformValues::ID = 0; void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) { - if (DA->isUniform(&I)) + if (UA->isUniform(&I)) setUniformMetadata(&I); } void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { Value *Ptr = I.getPointerOperand(); - if (!DA->isUniform(Ptr)) + if (!UA->isUniform(Ptr)) return; Instruction *PtrI = dyn_cast<Instruction>(Ptr); if (PtrI) @@ -108,7 +108,7 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) { if (skipFunction(F)) return false; - DA = &getAnalysis<LegacyDivergenceAnalysis>(); + UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index c916d5d547c4..7cd8e53e6521 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -38,9 +38,9 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" -#include "llvm/Support/TargetParser.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/TargetParser.h" using namespace llvm; using namespace llvm::AMDGPU; @@ -65,7 +65,7 @@ using namespace llvm::AMDGPU; // We want to use these instructions, and using fp32 denormals also causes // instructions to run at the double precision rate for the device so it's // 
probably best to just report no single precision denormals. -static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode) { +static uint32_t getFPMode(SIModeRegisterDefaults Mode) { return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) | @@ -78,8 +78,8 @@ createAMDGPUAsmPrinterPass(TargetMachine &tm, return new AMDGPUAsmPrinter(tm, std::move(Streamer)); } -extern "C" void LLVM_EXTERNAL_VISIBILITY LLVMInitializeAMDGPUAsmPrinter() { - TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(), +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() { + TargetRegistry::RegisterAsmPrinter(getTheR600Target(), llvm::createR600AsmPrinterPass); TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(), createAMDGPUAsmPrinterPass); @@ -89,18 +89,6 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)) { assert(OutStreamer && "AsmPrinter constructed without streamer"); - - if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { - if (isHsaAbiVersion2(getGlobalSTI())) { - HSAMetadataStream.reset(new HSAMD::MetadataStreamerYamlV2()); - } else if (isHsaAbiVersion3(getGlobalSTI())) { - HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV3()); - } else if (isHsaAbiVersion5(getGlobalSTI())) { - HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5()); - } else { - HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4()); - } - } } StringRef AMDGPUAsmPrinter::getPassName() const { @@ -133,7 +121,7 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { TM.getTargetTriple().getOS() != Triple::AMDPAL) return; - if (isHsaAbiVersion3AndAbove(getGlobalSTI())) + if (CodeObjectVersion >= AMDGPU::AMDHSA_COV3) getTargetStreamer()->EmitDirectiveAMDGCNTarget(); if (TM.getTargetTriple().getOS() == Triple::AMDHSA) @@ -142,7 +130,7 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { if 
(TM.getTargetTriple().getOS() == Triple::AMDPAL) getTargetStreamer()->getPALMetadata()->readFromIR(M); - if (isHsaAbiVersion3AndAbove(getGlobalSTI())) + if (CodeObjectVersion >= AMDGPU::AMDHSA_COV3) return; // HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2. @@ -161,7 +149,7 @@ void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) { initTargetStreamer(M); if (TM.getTargetTriple().getOS() != Triple::AMDHSA || - isHsaAbiVersion2(getGlobalSTI())) + CodeObjectVersion == AMDGPU::AMDHSA_COV2) getTargetStreamer()->EmitISAVersion(); // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). @@ -221,7 +209,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() { if (!MFI.isEntryFunction()) return; - if ((STM.isMesaKernel(F) || isHsaAbiVersion2(getGlobalSTI())) && + if ((STM.isMesaKernel(F) || CodeObjectVersion == AMDGPU::AMDHSA_COV2) && (F.getCallingConv() == CallingConv::AMDGPU_KERNEL || F.getCallingConv() == CallingConv::SPIR_KERNEL)) { amd_kernel_code_t KernelCode; @@ -239,7 +227,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { return; if (TM.getTargetTriple().getOS() != Triple::AMDHSA || - isHsaAbiVersion2(getGlobalSTI())) + CodeObjectVersion == AMDGPU::AMDHSA_COV2) return; auto &Streamer = getTargetStreamer()->getStreamer(); @@ -263,17 +251,18 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), CurrentProgramInfo.NumVGPRsForWavesPerEU, CurrentProgramInfo.NumSGPRsForWavesPerEU - - IsaInfo::getNumExtraSGPRs(&STM, - CurrentProgramInfo.VCCUsed, - CurrentProgramInfo.FlatUsed), - CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed); + IsaInfo::getNumExtraSGPRs( + &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, + getTargetStreamer()->getTargetID()->isXnackOnOrAny()), + CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, + CodeObjectVersion); Streamer.popSection(); } void AMDGPUAsmPrinter::emitFunctionEntryLabel() { if (TM.getTargetTriple().getOS() == Triple::AMDHSA && - 
isHsaAbiVersion3AndAbove(getGlobalSTI())) { + CodeObjectVersion >= AMDGPU::AMDHSA_COV3) { AsmPrinter::emitFunctionEntryLabel(); return; } @@ -343,6 +332,30 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { AsmPrinter::emitGlobalVariable(GV); } +bool AMDGPUAsmPrinter::doInitialization(Module &M) { + CodeObjectVersion = AMDGPU::getCodeObjectVersion(M); + + if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { + switch (CodeObjectVersion) { + case AMDGPU::AMDHSA_COV2: + HSAMetadataStream.reset(new HSAMD::MetadataStreamerYamlV2()); + break; + case AMDGPU::AMDHSA_COV3: + HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV3()); + break; + case AMDGPU::AMDHSA_COV4: + HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4()); + break; + case AMDGPU::AMDHSA_COV5: + HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5()); + break; + default: + report_fatal_error("Unexpected code object version"); + } + } + return AsmPrinter::doInitialization(M); +} + bool AMDGPUAsmPrinter::doFinalization(Module &M) { // Pad with s_code_end to help tools and guard against instruction prefetch // causing stale data in caches. 
Arguably this should be done by the linker, @@ -389,7 +402,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; } - if (MFI.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) { + if (MFI.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; } @@ -411,9 +424,8 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( } if (CurrentProgramInfo.DynamicCallStack && - AMDGPU::getAmdhsaCodeObjectVersion() >= 5) { + CodeObjectVersion >= AMDGPU::AMDHSA_COV5) KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK; - } return KernelCodeProperties; } @@ -429,7 +441,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( assert(isUInt<32>(PI.ScratchSize)); assert(isUInt<32>(PI.getComputePGMRSrc1())); - assert(isUInt<32>(PI.ComputePGMRSrc2)); + assert(isUInt<32>(PI.getComputePGMRSrc2())); KernelDescriptor.group_segment_fixed_size = PI.LDSSize; KernelDescriptor.private_segment_fixed_size = PI.ScratchSize; @@ -438,7 +450,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(); - KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2; + KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(); KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); @@ -567,28 +579,27 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->emitRawComment( " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:SCRATCH_EN: " + - Twine(G_00B84C_SCRATCH_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " 
COMPUTE_PGM_RSRC2:USER_SGPR: " + - Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + - Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TGID_X_EN: " + - Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TGID_Y_EN: " + - Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TGID_Z_EN: " + - Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + - Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)), - false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " + + Twine(CurrentProgramInfo.ScratchEnable), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + + Twine(CurrentProgramInfo.UserSGPR), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + + Twine(CurrentProgramInfo.TrapHandlerEnable), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " + + Twine(CurrentProgramInfo.TGIdXEnable), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " + + Twine(CurrentProgramInfo.TGIdYEnable), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " + + Twine(CurrentProgramInfo.TGIdZEnable), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + + Twine(CurrentProgramInfo.TIdIGCompCount), + false); assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0); @@ -631,7 +642,7 @@ void AMDGPUAsmPrinter::initializeTargetID(const Module &M) { // In the beginning all features are either 'Any' or 'NotSupported', // depending on global target features. This will cover empty modules. 
getTargetStreamer()->initializeTargetID( - *getGlobalSTI(), getGlobalSTI()->getFeatureString()); + *getGlobalSTI(), getGlobalSTI()->getFeatureString(), CodeObjectVersion); // If module is empty, we are done. if (M.empty()) @@ -709,7 +720,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be // unified. unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs( - &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed); + &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed, + getTargetStreamer()->getTargetID()->isXnackOnOrAny()); // Check the addressable register limit before we add ExtraSGPRs. if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && @@ -761,7 +773,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // There are some rare circumstances where InputAddr is non-zero and // InputEna can be set to 0. In this case we default to setting LastEna // to 1. - LastEna = InputEna ? findLastSet(InputEna) + 1 : 1; + LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1; } // FIXME: We should be using the number of registers determined during @@ -909,22 +921,21 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // anything to disable it if we know the stack isn't used here. We may still // have emitted code reading it to initialize scratch, but if that's unused // reading garbage should be OK. - const bool EnablePrivateSegment = + ProgInfo.ScratchEnable = ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack; - ProgInfo.ComputePGMRSrc2 = - S_00B84C_SCRATCH_EN(EnablePrivateSegment) | - S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | - // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. - S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 
0 : STM.isTrapHandlerEnabled()) | - S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | - S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | - S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | - S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) | - S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) | - S_00B84C_EXCP_EN_MSB(0) | - // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP. - S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) | - S_00B84C_EXCP_EN(0); + ProgInfo.UserSGPR = MFI->getNumUserSGPRs(); + // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. + ProgInfo.TrapHandlerEnable = + STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled(); + ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX(); + ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY(); + ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ(); + ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo(); + ProgInfo.TIdIGCompCount = TIDIGCompCnt; + ProgInfo.EXCPEnMSB = 0; + // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP. + ProgInfo.LdsSize = STM.isAmdHsaOS() ? 
0 : ProgInfo.LDSBlocks; + ProgInfo.EXCPEnable = 0; if (STM.hasGFX90AInsts()) { AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A, @@ -965,7 +976,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1()); OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2); - OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2); + OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2()); OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE); OutStreamer->emitInt32( @@ -1025,25 +1036,77 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, } MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU); - MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC)); - if (AMDGPU::isCompute(CC)) { - MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2); + if (MD->getPALMajorVersion() < 3) { + MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC)); + if (AMDGPU::isCompute(CC)) { + MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2()); + } else { + if (CurrentProgramInfo.ScratchBlocks > 0) + MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1)); + } } else { - if (CurrentProgramInfo.ScratchBlocks > 0) - MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1)); + MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode); + MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode); + MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode); + MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered); + + if (AMDGPU::isCompute(CC)) { + MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable); + MD->setHwStage(CC, ".trap_present", + (bool)CurrentProgramInfo.TrapHandlerEnable); + + // EXCPEnMSB? 
+ const unsigned LdsDwGranularity = 128; + MD->setHwStage(CC, ".lds_size", + (unsigned)(CurrentProgramInfo.LdsSize * LdsDwGranularity * + sizeof(uint32_t))); + MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable); + } else { + MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable); + } } + // ScratchSize is in bytes, 16 aligned. MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16)); if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2) : CurrentProgramInfo.LDSBlocks; - MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); - MD->setSpiPsInputEna(MFI->getPSInputEnable()); - MD->setSpiPsInputAddr(MFI->getPSInputAddr()); + if (MD->getPALMajorVersion() < 3) { + MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize)); + MD->setSpiPsInputEna(MFI->getPSInputEnable()); + MD->setSpiPsInputAddr(MFI->getPSInputAddr()); + } else { + // Graphics registers + const unsigned ExtraLdsDwGranularity = + STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 
256 : 128; + MD->setGraphicsRegisters( + ".ps_extra_lds_size", + (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t))); + + // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr + static StringLiteral const PsInputFields[] = { + ".persp_sample_ena", ".persp_center_ena", + ".persp_centroid_ena", ".persp_pull_model_ena", + ".linear_sample_ena", ".linear_center_ena", + ".linear_centroid_ena", ".line_stipple_tex_ena", + ".pos_x_float_ena", ".pos_y_float_ena", + ".pos_z_float_ena", ".pos_w_float_ena", + ".front_face_ena", ".ancillary_ena", + ".sample_coverage_ena", ".pos_fixed_pt_ena"}; + unsigned PSInputEna = MFI->getPSInputEnable(); + unsigned PSInputAddr = MFI->getPSInputAddr(); + for (auto [Idx, Field] : enumerate(PsInputFields)) { + MD->setGraphicsRegisters(".spi_ps_input_ena", Field, + (bool)((PSInputEna >> Idx) & 1)); + MD->setGraphicsRegisters(".spi_ps_input_addr", Field, + (bool)((PSInputAddr >> Idx) & 1)); + } + } } - if (STM.isWave32()) + // For version 3 and above the wave front size is already set in the metadata + if (MD->getPALMajorVersion() < 3 && STM.isWave32()) MD->setWave32(MF.getFunction().getCallingConv()); } @@ -1055,7 +1118,7 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) { // Set compute registers MD->setRsrc1(CallingConv::AMDGPU_CS, CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS)); - MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2); + MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2()); // Set optional info MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize); @@ -1091,7 +1154,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, Out.compute_pgm_resource_registers = CurrentProgramInfo.getComputePGMRSrc1() | - (CurrentProgramInfo.ComputePGMRSrc2 << 32); + (CurrentProgramInfo.getComputePGMRSrc2() << 32); Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; if (CurrentProgramInfo.DynamicCallStack) @@ -1109,7 
+1172,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, if (MFI->hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; - if (MFI->hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) + if (MFI->hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; if (MFI->hasKernargSegmentPtr()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index ea12086751a4..d490209ce35e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -39,6 +39,7 @@ struct kernel_descriptor_t; class AMDGPUAsmPrinter final : public AsmPrinter { private: + unsigned CodeObjectVersion; void initializeTargetID(const Module &M); AMDGPUResourceUsageAnalysis *ResourceUsage; @@ -90,6 +91,7 @@ public: AMDGPUTargetStreamer* getTargetStreamer() const; + bool doInitialization(Module &M) override; bool doFinalization(Module &M) override; bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 28967bb8e5b1..9795928094f4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -10,12 +10,21 @@ /// This pass optimizes atomic operations by using a single lane of a wavefront /// to perform the atomic operation, thus reducing contention on that memory /// location. -// +/// Atomic optimizer uses following strategies to compute scan and reduced +/// values +/// 1. DPP - +/// This is the most efficient implementation for scan. DPP uses Whole Wave +/// Mode (WWM) +/// 2. 
Iterative - +// An alternative implementation iterates over all active lanes +/// of Wavefront using llvm.cttz and performs scan using readlane & writelane +/// intrinsics //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "GCNSubtarget.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" @@ -38,36 +47,57 @@ struct ReplacementInfo { bool ValDivergent; }; -class AMDGPUAtomicOptimizer : public FunctionPass, - public InstVisitor<AMDGPUAtomicOptimizer> { +class AMDGPUAtomicOptimizer : public FunctionPass { +public: + static char ID; + ScanOptions ScanImpl; + AMDGPUAtomicOptimizer(ScanOptions ScanImpl) + : FunctionPass(ID), ScanImpl(ScanImpl) {} + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<UniformityInfoWrapperPass>(); + AU.addRequired<TargetPassConfig>(); + } +}; + +class AMDGPUAtomicOptimizerImpl + : public InstVisitor<AMDGPUAtomicOptimizerImpl> { private: SmallVector<ReplacementInfo, 8> ToReplace; - const LegacyDivergenceAnalysis *DA; + const UniformityInfo *UA; const DataLayout *DL; - DominatorTree *DT; + DomTreeUpdater &DTU; const GCNSubtarget *ST; bool IsPixelShader; + ScanOptions ScanImpl; Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, Value *const Identity) const; Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, Value *const Identity) const; Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const; + + std::pair<Value *, Value *> + buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op, + Value *const Identity, Value *V, Instruction &I, + BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const; + void 
optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx, bool ValDivergent) const; public: - static char ID; + AMDGPUAtomicOptimizerImpl() = delete; - AMDGPUAtomicOptimizer() : FunctionPass(ID) {} - - bool runOnFunction(Function &F) override; + AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL, + DomTreeUpdater &DTU, const GCNSubtarget *ST, + bool IsPixelShader, ScanOptions ScanImpl) + : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader), + ScanImpl(ScanImpl) {} - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<LegacyDivergenceAnalysis>(); - AU.addRequired<TargetPassConfig>(); - } + bool run(Function &F); void visitAtomicRMWInst(AtomicRMWInst &I); void visitIntrinsicInst(IntrinsicInst &I); @@ -84,15 +114,56 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) { return false; } - DA = &getAnalysis<LegacyDivergenceAnalysis>(); - DL = &F.getParent()->getDataLayout(); + const UniformityInfo *UA = + &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); + const DataLayout *DL = &F.getParent()->getDataLayout(); + DominatorTreeWrapperPass *const DTW = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DT = DTW ? &DTW->getDomTree() : nullptr; + DomTreeUpdater DTU(DTW ? 
&DTW->getDomTree() : nullptr, + DomTreeUpdater::UpdateStrategy::Lazy); + const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); const TargetMachine &TM = TPC.getTM<TargetMachine>(); - ST = &TM.getSubtarget<GCNSubtarget>(F); - IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS; + const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F); + + bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS; + + return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl) + .run(F); +} + +PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F, + FunctionAnalysisManager &AM) { + + const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F); + const DataLayout *DL = &F.getParent()->getDataLayout(); + + DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F), + DomTreeUpdater::UpdateStrategy::Lazy); + const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F); + + bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS; + + bool IsChanged = + AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl) + .run(F); + + if (!IsChanged) { + return PreservedAnalyses::all(); + } + + PreservedAnalyses PA; + PA.preserve<DominatorTreeAnalysis>(); + return PA; +} + +bool AMDGPUAtomicOptimizerImpl::run(Function &F) { + + // Scan option None disables the Pass + if (ScanImpl == ScanOptions::None) { + return false; + } visit(F); @@ -107,7 +178,7 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) { return Changed; } -void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) { +void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) { // Early exit for unhandled address space atomic instructions. switch (I.getPointerAddressSpace()) { default: @@ -139,11 +210,11 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) { // If the pointer operand is divergent, then each lane is doing an atomic // operation on a different address, and we cannot optimize that. 
- if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) { + if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) { return; } - const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx)); + const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx)); // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if @@ -162,7 +233,7 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) { ToReplace.push_back(Info); } -void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { +void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) { AtomicRMWInst::BinOp Op; switch (I.getIntrinsicID()) { @@ -170,54 +241,72 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { return; case Intrinsic::amdgcn_buffer_atomic_add: case Intrinsic::amdgcn_struct_buffer_atomic_add: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: case Intrinsic::amdgcn_raw_buffer_atomic_add: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: Op = AtomicRMWInst::Add; break; case Intrinsic::amdgcn_buffer_atomic_sub: case Intrinsic::amdgcn_struct_buffer_atomic_sub: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: case Intrinsic::amdgcn_raw_buffer_atomic_sub: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: Op = AtomicRMWInst::Sub; break; case Intrinsic::amdgcn_buffer_atomic_and: case Intrinsic::amdgcn_struct_buffer_atomic_and: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: case Intrinsic::amdgcn_raw_buffer_atomic_and: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: Op = AtomicRMWInst::And; break; case Intrinsic::amdgcn_buffer_atomic_or: case Intrinsic::amdgcn_struct_buffer_atomic_or: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: case Intrinsic::amdgcn_raw_buffer_atomic_or: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: Op = AtomicRMWInst::Or; break; case Intrinsic::amdgcn_buffer_atomic_xor: case 
Intrinsic::amdgcn_struct_buffer_atomic_xor: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: case Intrinsic::amdgcn_raw_buffer_atomic_xor: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: Op = AtomicRMWInst::Xor; break; case Intrinsic::amdgcn_buffer_atomic_smin: case Intrinsic::amdgcn_struct_buffer_atomic_smin: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: case Intrinsic::amdgcn_raw_buffer_atomic_smin: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: Op = AtomicRMWInst::Min; break; case Intrinsic::amdgcn_buffer_atomic_umin: case Intrinsic::amdgcn_struct_buffer_atomic_umin: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: case Intrinsic::amdgcn_raw_buffer_atomic_umin: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: Op = AtomicRMWInst::UMin; break; case Intrinsic::amdgcn_buffer_atomic_smax: case Intrinsic::amdgcn_struct_buffer_atomic_smax: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: case Intrinsic::amdgcn_raw_buffer_atomic_smax: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: Op = AtomicRMWInst::Max; break; case Intrinsic::amdgcn_buffer_atomic_umax: case Intrinsic::amdgcn_struct_buffer_atomic_umax: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: case Intrinsic::amdgcn_raw_buffer_atomic_umax: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: Op = AtomicRMWInst::UMax; break; } const unsigned ValIdx = 0; - const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx)); + const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx)); // If the value operand is divergent, each lane is contributing a different // value to the atomic calculation. We can only optimize divergent values if @@ -231,7 +320,7 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { // If any of the other arguments to the intrinsic are divergent, we can't // optimize the operation. 
for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) { - if (DA->isDivergentUse(&I.getOperandUse(Idx))) { + if (UA->isDivergentUse(I.getOperandUse(Idx))) { return; } } @@ -283,9 +372,10 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op, // Use the builder to create a reduction of V across the wavefront, with all // lanes active, returning the same result in all lanes. -Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B, - AtomicRMWInst::BinOp Op, Value *V, - Value *const Identity) const { +Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, + AtomicRMWInst::BinOp Op, + Value *V, + Value *const Identity) const { Type *const Ty = V->getType(); Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = @@ -328,8 +418,9 @@ Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B, // Use the builder to create an inclusive scan of V across the wavefront, with // all lanes active. -Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, - Value *V, Value *const Identity) const { +Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, + AtomicRMWInst::BinOp Op, Value *V, + Value *const Identity) const { Type *const Ty = V->getType(); Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = @@ -385,8 +476,8 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, // Use the builder to create a shift right of V across the wavefront, with all // lanes active, to turn an inclusive scan into an exclusive scan. 
-Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V, - Value *const Identity) const { +Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, + Value *const Identity) const { Type *const Ty = V->getType(); Module *M = B.GetInsertBlock()->getModule(); Function *UpdateDPP = @@ -430,6 +521,75 @@ Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V, return V; } +// Use the builder to create an exclusive scan and compute the final reduced +// value using an iterative approach. This provides an alternative +// implementation to DPP which uses WMM for scan computations. This API iterate +// over active lanes to read, compute and update the value using +// readlane and writelane intrinsics. +std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively( + IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V, + Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const { + + auto *Ty = I.getType(); + auto *WaveTy = B.getIntNTy(ST->getWavefrontSize()); + auto *EntryBB = I.getParent(); + auto NeedResult = !I.use_empty(); + + auto *Ballot = + B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue()); + + // Start inserting instructions for ComputeLoop block + B.SetInsertPoint(ComputeLoop); + // Phi nodes for Accumulator, Scan results destination, and Active Lanes + auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator"); + Accumulator->addIncoming(Identity, EntryBB); + PHINode *OldValuePhi = nullptr; + if (NeedResult) { + OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi"); + OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB); + } + auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits"); + ActiveBits->addIncoming(Ballot, EntryBB); + + // Use llvm.cttz instrinsic to find the lowest remaining active lane. 
+ auto *FF1 = + B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()}); + auto *LaneIdxInt = B.CreateTrunc(FF1, Ty); + + // Get the value required for atomic operation + auto *LaneValue = + B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt}); + + // Perform writelane if intermediate scan results are required later in the + // kernel computations + Value *OldValue = nullptr; + if (NeedResult) { + OldValue = B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {}, + {Accumulator, LaneIdxInt, OldValuePhi}); + OldValuePhi->addIncoming(OldValue, ComputeLoop); + } + + // Accumulate the results + auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue); + Accumulator->addIncoming(NewAccumulator, ComputeLoop); + + // Set bit to zero of current active lane so that for next iteration llvm.cttz + // return the next active lane + auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1); + + auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1)); + auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask); + ActiveBits->addIncoming(NewActiveBits, ComputeLoop); + + // Branch out of the loop when all lanes are processed. + auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0)); + B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop); + + B.SetInsertPoint(ComputeEnd); + + return {OldValue, NewAccumulator}; +} + static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op, unsigned BitWidth) { switch (Op) { @@ -456,10 +616,10 @@ static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) { return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS); } -void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, - AtomicRMWInst::BinOp Op, - unsigned ValIdx, - bool ValDivergent) const { +void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, + AtomicRMWInst::BinOp Op, + unsigned ValIdx, + bool ValDivergent) const { // Start building just before the instruction. 
IRBuilder<> B(&I); @@ -479,7 +639,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {}); Instruction *const NonHelperTerminator = - SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr); + SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr); // Record I's new position as the exit block. PixelExitBB = I.getParent(); @@ -528,36 +688,50 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, const bool NeedResult = !I.use_empty(); + Function *F = I.getFunction(); + LLVMContext &C = F->getContext(); + BasicBlock *ComputeLoop = nullptr; + BasicBlock *ComputeEnd = nullptr; // If we have a divergent value in each lane, we need to combine the value // using DPP. if (ValDivergent) { - // First we need to set all inactive invocations to the identity value, so - // that they can correctly contribute to the final result. - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); - const AtomicRMWInst::BinOp ScanOp = Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; - if (!NeedResult && ST->hasPermLaneX16()) { - // On GFX10 the permlanex16 instruction helps us build a reduction without - // too many readlanes and writelanes, which are generally bad for - // performance. - NewV = buildReduction(B, ScanOp, NewV, Identity); + if (ScanImpl == ScanOptions::DPP) { + // First we need to set all inactive invocations to the identity value, so + // that they can correctly contribute to the final result. + NewV = + B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); + const AtomicRMWInst::BinOp ScanOp = + Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; + if (!NeedResult && ST->hasPermLaneX16()) { + // On GFX10 the permlanex16 instruction helps us build a reduction + // without too many readlanes and writelanes, which are generally bad + // for performance. 
+ NewV = buildReduction(B, ScanOp, NewV, Identity); + } else { + NewV = buildScan(B, ScanOp, NewV, Identity); + if (NeedResult) + ExclScan = buildShiftRight(B, NewV, Identity); + // Read the value from the last lane, which has accumulated the values + // of each active lane in the wavefront. This will be our new value + // which we will provide to the atomic operation. + Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); + assert(TyBitWidth == 32); + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, + {NewV, LastLaneIdx}); + } + // Finally mark the readlanes in the WWM section. + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); + } else if (ScanImpl == ScanOptions::Iterative) { + // Alternative implementation for scan + ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F); + ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F); + std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I, + ComputeLoop, ComputeEnd); } else { - NewV = buildScan(B, ScanOp, NewV, Identity); - if (NeedResult) - ExclScan = buildShiftRight(B, NewV, Identity); - - // Read the value from the last lane, which has accumulated the values of - // each active lane in the wavefront. This will be our new value which we - // will provide to the atomic operation. - Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); - assert(TyBitWidth == 32); - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, - {NewV, LastLaneIdx}); + llvm_unreachable("Atomic Optimzer is disabled for None strategy"); } - - // Finally mark the readlanes in the WWM section. 
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); } else { switch (Op) { default: @@ -608,8 +782,39 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // entry --> single_lane -\ // \------------------> exit Instruction *const SingleLaneTerminator = - SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr); + SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr); + + // At this point, we have split the I's block to allow one lane in wavefront + // to update the precomputed reduced value. Also, completed the codegen for + // new control flow i.e. iterative loop which perform reduction and scan using + // ComputeLoop and ComputeEnd. + // For the new control flow, we need to move branch instruction i.e. + // terminator created during SplitBlockAndInsertIfThen from I's block to + // ComputeEnd block. We also need to set up predecessor to next block when + // single lane done updating the final reduced value. + BasicBlock *Predecessor = nullptr; + if (ValDivergent && ScanImpl == ScanOptions::Iterative) { + // Move terminator from I's block to ComputeEnd block. + Instruction *Terminator = EntryBB->getTerminator(); + B.SetInsertPoint(ComputeEnd); + Terminator->removeFromParent(); + B.Insert(Terminator); + + // Branch to ComputeLoop Block unconditionally from the I's block for + // iterative approach. + B.SetInsertPoint(EntryBB); + B.CreateBr(ComputeLoop); + // Update the dominator tree for new control flow. + DTU.applyUpdates( + {{DominatorTree::Insert, EntryBB, ComputeLoop}, + {DominatorTree::Insert, ComputeLoop, ComputeEnd}, + {DominatorTree::Delete, EntryBB, SingleLaneTerminator->getParent()}}); + + Predecessor = ComputeEnd; + } else { + Predecessor = EntryBB; + } // Move the IR builder into single_lane next. B.SetInsertPoint(SingleLaneTerminator); @@ -626,7 +831,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, if (NeedResult) { // Create a PHI node to get our new atomic result into the exit block. 
PHINode *const PHI = B.CreatePHI(Ty, 2); - PHI->addIncoming(PoisonValue::get(Ty), EntryBB); + PHI->addIncoming(PoisonValue::get(Ty), Predecessor); PHI->addIncoming(NewI, SingleLaneTerminator->getParent()); // We need to broadcast the value who was the lowest active lane (the first @@ -660,8 +865,14 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // from the first lane, to get our lane's index into the atomic result. Value *LaneOffset = nullptr; if (ValDivergent) { - LaneOffset = - B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan); + if (ScanImpl == ScanOptions::DPP) { + LaneOffset = + B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan); + } else if (ScanImpl == ScanOptions::Iterative) { + LaneOffset = ExclScan; + } else { + llvm_unreachable("Atomic Optimzer is disabled for None strategy"); + } } else { switch (Op) { default: @@ -705,11 +916,11 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE, "AMDGPU atomic optimizations", false, false) -INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE, "AMDGPU atomic optimizations", false, false) -FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() { - return new AMDGPUAtomicOptimizer(); +FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) { + return new AMDGPUAtomicOptimizer(ScanStrategy); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index f7298b59f0b9..57c873f00a4a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -56,8 +56,8 @@ static constexpr std::pair<ImplicitArgumentMask, // size is 1 for y/z. 
static ImplicitArgumentMask intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, - bool HasApertureRegs, bool SupportsGetDoorBellID) { - unsigned CodeObjectVersion = AMDGPU::getAmdhsaCodeObjectVersion(); + bool HasApertureRegs, bool SupportsGetDoorBellID, + unsigned CodeObjectVersion) { switch (ID) { case Intrinsic::amdgcn_workitem_id_x: NonKernelOnly = true; @@ -88,7 +88,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access // queue_ptr. case Intrinsic::amdgcn_queue_ptr: - NeedsImplicit = (CodeObjectVersion == 5); + NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5); return QUEUE_PTR; case Intrinsic::amdgcn_is_shared: case Intrinsic::amdgcn_is_private: @@ -97,11 +97,13 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, // Under V5, we need implicitarg_ptr + offsets to access private_base or // shared_base. For pre-V5, however, need to access them through queue_ptr + // offsets. - return CodeObjectVersion == 5 ? IMPLICIT_ARG_PTR : QUEUE_PTR; + return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR : + QUEUE_PTR; case Intrinsic::trap: if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4. - return CodeObjectVersion >= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR; - NeedsImplicit = (CodeObjectVersion == 5); // Need impicitarg_ptr under V5. + return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? 
NOT_IMPLICIT_INPUT : + QUEUE_PTR; + NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5); return QUEUE_PTR; default: return NOT_IMPLICIT_INPUT; @@ -137,7 +139,9 @@ public: AMDGPUInformationCache(const Module &M, AnalysisGetter &AG, BumpPtrAllocator &Allocator, SetVector<Function *> *CGSCC, TargetMachine &TM) - : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {} + : InformationCache(M, AG, Allocator, CGSCC), TM(TM), + CodeObjectVersion(AMDGPU::getCodeObjectVersion(M)) {} + TargetMachine &TM; enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 }; @@ -165,6 +169,34 @@ public: return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()}; } + /// Get code object version. + unsigned getCodeObjectVersion() const { + return CodeObjectVersion; + } + + /// Get the effective value of "amdgpu-waves-per-eu" for the function, + /// accounting for the interaction with the passed value to use for + /// "amdgpu-flat-work-group-size". + std::pair<unsigned, unsigned> + getWavesPerEU(const Function &F, + std::pair<unsigned, unsigned> FlatWorkGroupSize) { + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); + return ST.getWavesPerEU(F, FlatWorkGroupSize); + } + + std::pair<unsigned, unsigned> + getEffectiveWavesPerEU(const Function &F, + std::pair<unsigned, unsigned> WavesPerEU, + std::pair<unsigned, unsigned> FlatWorkGroupSize) { + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); + return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize); + } + + unsigned getMaxWavesPerEU(const Function &F) { + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); + return ST.getMaxWavesPerEU(); + } + private: /// Check if the ConstantExpr \p CE requires the queue pointer. static bool visitConstExpr(const ConstantExpr *CE) { @@ -176,7 +208,8 @@ private: } /// Get the constant access bitmap for \p C. 
- uint8_t getConstantAccess(const Constant *C) { + uint8_t getConstantAccess(const Constant *C, + SmallPtrSetImpl<const Constant *> &Visited) { auto It = ConstantStatus.find(C); if (It != ConstantStatus.end()) return It->second; @@ -191,10 +224,10 @@ private: for (const Use &U : C->operands()) { const auto *OpC = dyn_cast<Constant>(U); - if (!OpC) + if (!OpC || !Visited.insert(OpC).second) continue; - Result |= getConstantAccess(OpC); + Result |= getConstantAccess(OpC, Visited); } return Result; } @@ -209,7 +242,8 @@ public: if (!IsNonEntryFunc && HasAperture) return false; - uint8_t Access = getConstantAccess(C); + SmallPtrSet<const Constant *, 8> Visited; + uint8_t Access = getConstantAccess(C, Visited); // We need to trap on DS globals in non-entry functions. if (IsNonEntryFunc && (Access & DS_GLOBAL)) @@ -221,6 +255,7 @@ public: private: /// Used to determine if the Constant needs the queue pointer. DenseMap<const Constant *, uint8_t> ConstantStatus; + const unsigned CodeObjectVersion; }; struct AAAMDAttributes @@ -311,11 +346,13 @@ struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize { LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName() << "->" << getAssociatedFunction()->getName() << "\n"); - const auto &CallerInfo = A.getAAFor<AAUniformWorkGroupSize>( + const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>( *this, IRPosition::function(*Caller), DepClassTy::REQUIRED); + if (!CallerInfo) + return false; Change = Change | clampStateAndIndicateChange(this->getState(), - CallerInfo.getState()); + CallerInfo->getState()); return true; }; @@ -333,8 +370,8 @@ struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize { AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size", getAssumed() ? 
"true" : "false")); - return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList, - /* ForceReplace */ true); + return A.manifestAttrs(getIRPosition(), AttrList, + /* ForceReplace */ true); } bool isValidState() const override { @@ -342,7 +379,7 @@ struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize { return true; } - const std::string getAsStr() const override { + const std::string getAsStr(Attributor *) const override { return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]"; } @@ -400,9 +437,9 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { auto OrigAssumed = getAssumed(); // Check for Intrinsics and propagate attributes. - const AACallEdges &AAEdges = A.getAAFor<AACallEdges>( + const AACallEdges *AAEdges = A.getAAFor<AACallEdges>( *this, this->getIRPosition(), DepClassTy::REQUIRED); - if (AAEdges.hasNonAsmUnknownCallee()) + if (!AAEdges || AAEdges->hasNonAsmUnknownCallee()) return indicatePessimisticFixpoint(); bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv()); @@ -411,20 +448,23 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache()); bool HasApertureRegs = InfoCache.hasApertureRegs(*F); bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F); + unsigned COV = InfoCache.getCodeObjectVersion(); - for (Function *Callee : AAEdges.getOptimisticEdges()) { + for (Function *Callee : AAEdges->getOptimisticEdges()) { Intrinsic::ID IID = Callee->getIntrinsicID(); if (IID == Intrinsic::not_intrinsic) { - const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>( - *this, IRPosition::function(*Callee), DepClassTy::REQUIRED); - *this &= AAAMD; + const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>( + *this, IRPosition::function(*Callee), DepClassTy::REQUIRED); + if (!AAAMD) + return indicatePessimisticFixpoint(); + *this &= *AAAMD; continue; } bool NonKernelOnly = false; ImplicitArgumentMask AttrMask = 
intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit, - HasApertureRegs, SupportsGetDoorbellID); + HasApertureRegs, SupportsGetDoorbellID, COV); if (AttrMask != NOT_IMPLICIT_INPUT) { if ((IsNonEntryFunc || !NonKernelOnly)) removeAssumedBits(AttrMask); @@ -438,29 +478,29 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) { // Under V5, we need implicitarg_ptr + offsets to access private_base or // shared_base. We do not actually need queue_ptr. - if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) + if (COV >= 5) removeAssumedBits(IMPLICIT_ARG_PTR); else removeAssumedBits(QUEUE_PTR); } - if (funcRetrievesMultigridSyncArg(A)) { + if (funcRetrievesMultigridSyncArg(A, COV)) { assert(!isAssumed(IMPLICIT_ARG_PTR) && "multigrid_sync_arg needs implicitarg_ptr"); removeAssumedBits(MULTIGRID_SYNC_ARG); } - if (funcRetrievesHostcallPtr(A)) { + if (funcRetrievesHostcallPtr(A, COV)) { assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr"); removeAssumedBits(HOSTCALL_PTR); } - if (funcRetrievesHeapPtr(A)) { + if (funcRetrievesHeapPtr(A, COV)) { assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr"); removeAssumedBits(HEAP_PTR); } - if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A)) { + if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) { assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr"); removeAssumedBits(QUEUE_PTR); } @@ -469,10 +509,10 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { removeAssumedBits(LDS_KERNEL_ID); } - if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A)) + if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV)) removeAssumedBits(DEFAULT_QUEUE); - if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A)) + if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV)) removeAssumedBits(COMPLETION_ACTION); return getAssumed() != OrigAssumed ? 
ChangeStatus::CHANGED @@ -488,16 +528,17 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { AttrList.push_back(Attribute::get(Ctx, Attr.second)); } - return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList, - /* ForceReplace */ true); + return A.manifestAttrs(getIRPosition(), AttrList, + /* ForceReplace */ true); } - const std::string getAsStr() const override { + const std::string getAsStr(Attributor *) const override { std::string Str; raw_string_ostream OS(Str); OS << "AMDInfo["; for (auto Attr : ImplicitAttrs) - OS << ' ' << Attr.second; + if (isAssumed(Attr.first)) + OS << ' ' << Attr.second; OS << " ]"; return OS.str(); } @@ -557,39 +598,39 @@ private: return false; } - bool funcRetrievesMultigridSyncArg(Attributor &A) { - auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(); + bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) { + auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV); AA::RangeTy Range(Pos, 8); return funcRetrievesImplicitKernelArg(A, Range); } - bool funcRetrievesHostcallPtr(Attributor &A) { - auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(); + bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) { + auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV); AA::RangeTy Range(Pos, 8); return funcRetrievesImplicitKernelArg(A, Range); } - bool funcRetrievesDefaultQueue(Attributor &A) { - auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(); + bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) { + auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV); AA::RangeTy Range(Pos, 8); return funcRetrievesImplicitKernelArg(A, Range); } - bool funcRetrievesCompletionAction(Attributor &A) { - auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(); + bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) { + auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV); AA::RangeTy Range(Pos, 8); return 
funcRetrievesImplicitKernelArg(A, Range); } - bool funcRetrievesHeapPtr(Attributor &A) { - if (AMDGPU::getAmdhsaCodeObjectVersion() != 5) + bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) { + if (COV < 5) return false; AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8); return funcRetrievesImplicitKernelArg(A, Range); } - bool funcRetrievesQueuePtr(Attributor &A) { - if (AMDGPU::getAmdhsaCodeObjectVersion() != 5) + bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) { + if (COV < 5) return false; AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8); return funcRetrievesImplicitKernelArg(A, Range); @@ -607,10 +648,12 @@ private: if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr) return true; - const auto &PointerInfoAA = A.getAAFor<AAPointerInfo>( + const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>( *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED); + if (!PointerInfoAA) + return false; - return PointerInfoAA.forallInterferingAccesses( + return PointerInfoAA->forallInterferingAccesses( Range, [](const AAPointerInfo::Access &Acc, bool IsExact) { return Acc.getRemoteInst()->isDroppable(); }); @@ -639,42 +682,36 @@ AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP, llvm_unreachable("AAAMDAttributes is only valid for function position"); } -/// Propagate amdgpu-flat-work-group-size attribute. -struct AAAMDFlatWorkGroupSize +/// Base class to derive different size ranges. +struct AAAMDSizeRangeAttribute : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> { using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>; - AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A) - : Base(IRP, 32) {} - /// See AbstractAttribute::getState(...). 
- IntegerRangeState &getState() override { return *this; } - const IntegerRangeState &getState() const override { return *this; } + StringRef AttrName; - void initialize(Attributor &A) override { - Function *F = getAssociatedFunction(); - auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache()); - unsigned MinGroupSize, MaxGroupSize; - std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F); - intersectKnown( - ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1))); + AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A, + StringRef AttrName) + : Base(IRP, 32), AttrName(AttrName) {} - if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) - indicatePessimisticFixpoint(); - } + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} - ChangeStatus updateImpl(Attributor &A) override { + template <class AttributeImpl> + ChangeStatus updateImplImpl(Attributor &A) { ChangeStatus Change = ChangeStatus::UNCHANGED; auto CheckCallSite = [&](AbstractCallSite CS) { Function *Caller = CS.getInstruction()->getFunction(); - LLVM_DEBUG(dbgs() << "[AAAMDFlatWorkGroupSize] Call " << Caller->getName() + LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName() << "->" << getAssociatedFunction()->getName() << '\n'); - const auto &CallerInfo = A.getAAFor<AAAMDFlatWorkGroupSize>( + const auto *CallerInfo = A.getAAFor<AttributeImpl>( *this, IRPosition::function(*Caller), DepClassTy::REQUIRED); + if (!CallerInfo) + return false; Change |= - clampStateAndIndicateChange(this->getState(), CallerInfo.getState()); + clampStateAndIndicateChange(this->getState(), CallerInfo->getState()); return true; }; @@ -686,45 +723,65 @@ struct AAAMDFlatWorkGroupSize return Change; } - ChangeStatus manifest(Attributor &A) override { - SmallVector<Attribute, 8> AttrList; - Function *F = getAssociatedFunction(); - LLVMContext &Ctx = F->getContext(); - - auto &InfoCache = static_cast<AMDGPUInformationCache 
&>(A.getInfoCache()); - unsigned Min, Max; - std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F); - + ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min, + unsigned Max) { // Don't add the attribute if it's the implied default. if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max) return ChangeStatus::UNCHANGED; + Function *F = getAssociatedFunction(); + LLVMContext &Ctx = F->getContext(); SmallString<10> Buffer; raw_svector_ostream OS(Buffer); OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1; - - AttrList.push_back( - Attribute::get(Ctx, "amdgpu-flat-work-group-size", OS.str())); - return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList, - /* ForceReplace */ true); + return A.manifestAttrs(getIRPosition(), + {Attribute::get(Ctx, AttrName, OS.str())}, + /* ForceReplace */ true); } - const std::string getAsStr() const override { + const std::string getAsStr(Attributor *) const override { std::string Str; raw_string_ostream OS(Str); - OS << "AMDFlatWorkGroupSize["; + OS << getName() << '['; OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1; OS << ']'; return OS.str(); } +}; - /// See AbstractAttribute::trackStatistics() - void trackStatistics() const override {} +/// Propagate amdgpu-flat-work-group-size attribute. 
+struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute { + AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A) + : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {} + + void initialize(Attributor &A) override { + Function *F = getAssociatedFunction(); + auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache()); + unsigned MinGroupSize, MaxGroupSize; + std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F); + intersectKnown( + ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1))); + + if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) + indicatePessimisticFixpoint(); + } + + ChangeStatus updateImpl(Attributor &A) override { + return updateImplImpl<AAAMDFlatWorkGroupSize>(A); + } /// Create an abstract attribute view for the position \p IRP. static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP, Attributor &A); + ChangeStatus manifest(Attributor &A) override { + Function *F = getAssociatedFunction(); + auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache()); + unsigned Min, Max; + std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F); + return emitAttributeIfNotDefault(A, Min, Max); + } + /// See AbstractAttribute::getName() const std::string getName() const override { return "AAAMDFlatWorkGroupSize"; @@ -754,6 +811,109 @@ AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP, "AAAMDFlatWorkGroupSize is only valid for function position"); } +/// Propagate amdgpu-waves-per-eu attribute. 
+struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute { + AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A) + : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {} + + bool isValidState() const override { + return !Assumed.isEmptySet() && IntegerRangeState::isValidState(); + } + + void initialize(Attributor &A) override { + Function *F = getAssociatedFunction(); + auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache()); + + if (const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>( + *this, IRPosition::function(*F), DepClassTy::REQUIRED)) { + + unsigned Min, Max; + std::tie(Min, Max) = InfoCache.getWavesPerEU( + *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(), + AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); + + ConstantRange Range(APInt(32, Min), APInt(32, Max + 1)); + intersectKnown(Range); + } + + if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) + indicatePessimisticFixpoint(); + } + + ChangeStatus updateImpl(Attributor &A) override { + auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache()); + ChangeStatus Change = ChangeStatus::UNCHANGED; + + auto CheckCallSite = [&](AbstractCallSite CS) { + Function *Caller = CS.getInstruction()->getFunction(); + Function *Func = getAssociatedFunction(); + LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName() + << "->" << Func->getName() << '\n'); + + const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>( + *this, IRPosition::function(*Caller), DepClassTy::REQUIRED); + const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>( + *this, IRPosition::function(*Func), DepClassTy::REQUIRED); + if (!CallerInfo || !AssumedGroupSize) + return false; + + unsigned Min, Max; + std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU( + *Caller, + {CallerInfo->getAssumed().getLower().getZExtValue(), + CallerInfo->getAssumed().getUpper().getZExtValue() - 1}, + {AssumedGroupSize->getAssumed().getLower().getZExtValue(), + 
AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1}); + ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1)); + IntegerRangeState CallerRangeState(CallerRange); + Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState); + + return true; + }; + + bool AllCallSitesKnown = true; + if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown)) + return indicatePessimisticFixpoint(); + + return Change; + } + + /// Create an abstract attribute view for the position \p IRP. + static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP, + Attributor &A); + + ChangeStatus manifest(Attributor &A) override { + Function *F = getAssociatedFunction(); + auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache()); + unsigned Max = InfoCache.getMaxWavesPerEU(*F); + return emitAttributeIfNotDefault(A, 1, Max); + } + + /// See AbstractAttribute::getName() + const std::string getName() const override { return "AAAMDWavesPerEU"; } + + /// See AbstractAttribute::getIdAddr() + const char *getIdAddr() const override { return &ID; } + + /// This function should return true if the type of the \p AA is + /// AAAMDWavesPerEU + static bool classof(const AbstractAttribute *AA) { + return (AA->getIdAddr() == &ID); + } + + /// Unique ID (due to the unique address) + static const char ID; +}; + +const char AAAMDWavesPerEU::ID = 0; + +AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP, + Attributor &A) { + if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION) + return *new (A.Allocator) AAAMDWavesPerEU(IRP, A); + llvm_unreachable("AAAMDWavesPerEU is only valid for function position"); +} + class AMDGPUAttributor : public ModulePass { public: AMDGPUAttributor() : ModulePass(ID) {} @@ -782,13 +942,17 @@ public: AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM); DenseSet<const char *> Allowed( {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID, - &AAPotentialValues::ID, 
&AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID, - &AAPointerInfo::ID, &AAPotentialConstantValues::ID}); + &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, + &AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID, + &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID}); AttributorConfig AC(CGUpdater); AC.Allowed = &Allowed; AC.IsModulePass = true; AC.DefaultInitializeLiveInternals = false; + AC.IPOAmendableCB = [](const Function &F) { + return F.getCallingConv() == CallingConv::AMDGPU_KERNEL; + }; Attributor A(Functions, InfoCache, AC); @@ -798,6 +962,7 @@ public: A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F)); if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) { A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F)); + A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(F)); } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index da819b6d4a23..9ba5ea8fb73f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -466,7 +466,9 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, CCInfo.AllocateReg(DispatchPtrReg); } - if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) { + const Module *M = MF.getFunction().getParent(); + if (Info.hasQueuePtr() && + AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); @@ -510,8 +512,6 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( const SITargetLowering &TLI = *getTLI<SITargetLowering>(); const DataLayout &DL = F.getParent()->getDataLayout(); - Info->allocateKnownAddressLDSGlobal(F); - SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); @@ -519,7 +519,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel( unsigned i = 0; const Align 
KernArgBaseAlign(16); - const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F); + const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(); uint64_t ExplicitArgOffset = 0; // TODO: Align down to dword alignment and extract bits for extending loads. @@ -594,8 +594,6 @@ bool AMDGPUCallLowering::lowerFormalArguments( const SIRegisterInfo *TRI = Subtarget.getRegisterInfo(); const DataLayout &DL = F.getParent()->getDataLayout(); - Info->allocateKnownAddressLDSGlobal(F); - SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext()); @@ -701,7 +699,7 @@ bool AMDGPUCallLowering::lowerFormalArguments( if ((PsInputBits & 0x7F) == 0 || ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1))) - Info->markPSInputEnabled(countTrailingZeros(Info->getPSInputAddr())); + Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr())); } } @@ -724,7 +722,7 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B)) return false; - uint64_t StackOffset = Assigner.StackOffset; + uint64_t StackSize = Assigner.StackSize; // Start adding system SGPRs. if (IsEntryFunc) { @@ -739,7 +737,7 @@ bool AMDGPUCallLowering::lowerFormalArguments( // the caller's stack. So, whenever we lower formal arguments, we should keep // track of this information, since we might lower a tail call in this // function later. - Info->setBytesInStackArgArea(StackOffset); + Info->setBytesInStackArgArea(StackSize); // Move back to the end of the basic block. B.setMBB(MBB); @@ -956,10 +954,14 @@ getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) { } static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, - bool IsTailCall) { + bool IsTailCall, CallingConv::ID CC) { assert(!(IsIndirect && IsTailCall) && "Indirect calls can't be tail calls, " "because the address can be divergent"); - return IsTailCall ? 
AMDGPU::SI_TCRETURN : AMDGPU::G_SI_CALL; + if (!IsTailCall) + return AMDGPU::G_SI_CALL; + + return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX : + AMDGPU::SI_TCRETURN; } // Add operands to call instruction to track the callee. @@ -1053,7 +1055,7 @@ bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable( // Make sure that they can fit on the caller's stack. const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); - if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) { + if (OutInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) { LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n"); return false; } @@ -1184,7 +1186,7 @@ bool AMDGPUCallLowering::lowerTailCall( if (!IsSibCall) CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP); - unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true); + unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true, CalleeCC); auto MIB = MIRBuilder.buildInstrNoInsert(Opc); if (!addCallTargetOperands(MIB, MIRBuilder, Info)) return false; @@ -1224,7 +1226,7 @@ bool AMDGPUCallLowering::lowerTailCall( // The callee will pop the argument stack as a tail call. Thus, we must // keep it 16-byte aligned. - NumBytes = alignTo(OutInfo.getNextStackOffset(), ST.getStackAlignment()); + NumBytes = alignTo(OutInfo.getStackSize(), ST.getStackAlignment()); // FPDiff will be negative if this tail call requires more space than we // would automatically have in our incoming argument space. Positive if we @@ -1348,7 +1350,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Create a temporarily-floating call instruction so we can add the implicit // uses of arg registers. 
- unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false); + unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, Info.CallConv); auto MIB = MIRBuilder.buildInstrNoInsert(Opc); MIB.addDef(TRI->getReturnAddressReg(MF)); @@ -1390,7 +1392,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs); // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getNextStackOffset(); + unsigned NumBytes = CCInfo.getStackSize(); // If Callee is a reg, since it is used by a target specific // instruction, it must have a register class matching the diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 08b29641d14a..4ec85f3c5588 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -14,23 +14,28 @@ #include "AMDGPU.h" #include "AMDGPUTargetMachine.h" +#include "SIModeRegisterDefaults.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicsAMDGPU.h" -#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Utils/IntegerDivision.h" +#include "llvm/Transforms/Utils/Local.h" #define DEBUG_TYPE "amdgpu-codegenprepare" using namespace llvm; +using namespace llvm::PatternMatch; namespace { @@ -46,6 +51,22 @@ static cl::opt<bool> Widen16BitOps( cl::ReallyHidden, cl::init(true)); +static cl::opt<bool> + 
ScalarizeLargePHIs("amdgpu-codegenprepare-break-large-phis", + cl::desc("Break large PHI nodes for DAGISel"), + cl::ReallyHidden, cl::init(true)); + +static cl::opt<bool> + ForceScalarizeLargePHIs("amdgpu-codegenprepare-force-break-large-phis", + cl::desc("For testing purposes, always break large " + "PHIs even if it isn't profitable."), + cl::ReallyHidden, cl::init(false)); + +static cl::opt<unsigned> ScalarizeLargePHIsThreshold( + "amdgpu-codegenprepare-break-large-phis-threshold", + cl::desc("Minimum type size in bits for breaking large PHI nodes"), + cl::ReallyHidden, cl::init(32)); + static cl::opt<bool> UseMul24Intrin( "amdgpu-codegenprepare-mul24", cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"), @@ -67,16 +88,30 @@ static cl::opt<bool> DisableIDivExpand( cl::ReallyHidden, cl::init(false)); -class AMDGPUCodeGenPrepare : public FunctionPass, - public InstVisitor<AMDGPUCodeGenPrepare, bool> { +// Disable processing of fdiv so we can better test the backend implementations. +static cl::opt<bool> DisableFDivExpand( + "amdgpu-codegenprepare-disable-fdiv-expansion", + cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, + cl::init(false)); + +class AMDGPUCodeGenPrepareImpl + : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> { +public: const GCNSubtarget *ST = nullptr; + const TargetLibraryInfo *TLInfo = nullptr; AssumptionCache *AC = nullptr; DominatorTree *DT = nullptr; - LegacyDivergenceAnalysis *DA = nullptr; + UniformityInfo *UA = nullptr; Module *Mod = nullptr; const DataLayout *DL = nullptr; bool HasUnsafeFPMath = false; - bool HasFP32Denormals = false; + bool HasFP32DenormalFlush = false; + bool FlowChanged = false; + + DenseMap<const PHINode *, bool> BreakPhiNodesCache; + + bool canBreakPHINode(const PHINode &I); /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to /// binary operation \p V. 
@@ -102,6 +137,21 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// false otherwise. bool needsPromotionToI32(const Type *T) const; + /// Return true if \p T is a legal scalar floating point type. + bool isLegalFloatingTy(const Type *T) const; + + /// Wrapper to pass all the arguments to computeKnownFPClass + KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested, + const Instruction *CtxI) const { + return llvm::computeKnownFPClass(V, *DL, Interested, 0, TLInfo, AC, CtxI, + DT); + } + + bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const { + return HasFP32DenormalFlush || + computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal(); + } + /// Promotes uniform binary operation \p I to equivalent 32 bit binary /// operation. /// @@ -199,41 +249,104 @@ class AMDGPUCodeGenPrepare : public FunctionPass, bool canWidenScalarExtLoad(LoadInst &I) const; -public: - static char ID; + Value *matchFractPat(IntrinsicInst &I); + Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg); + + bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF, + FastMathFlags SqrtFMF) const; - AMDGPUCodeGenPrepare() : FunctionPass(ID) {} + Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den, + FastMathFlags DivFMF, FastMathFlags SqrtFMF, + const Instruction *CtxI) const; + Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den, + FastMathFlags FMF, const Instruction *CtxI) const; + Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den, + float ReqdAccuracy) const; + + Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den, + FastMathFlags DivFMF, FastMathFlags SqrtFMF, + Value *RsqOp, const Instruction *FDiv, + float ReqdAccuracy) const; + + std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder, + Value *Src) const; + + Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src, + bool IsNegative) const; + Value *emitFrexpDiv(IRBuilder<> &Builder, 
Value *LHS, Value *RHS, + FastMathFlags FMF) const; + +public: bool visitFDiv(BinaryOperator &I); - bool visitXor(BinaryOperator &I); bool visitInstruction(Instruction &I) { return false; } bool visitBinaryOperator(BinaryOperator &I); bool visitLoadInst(LoadInst &I); bool visitICmpInst(ICmpInst &I); bool visitSelectInst(SelectInst &I); + bool visitPHINode(PHINode &I); bool visitIntrinsicInst(IntrinsicInst &I); bool visitBitreverseIntrinsicInst(IntrinsicInst &I); + bool visitMinNum(IntrinsicInst &I); + bool run(Function &F); +}; - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - - StringRef getPassName() const override { return "AMDGPU IR optimizations"; } +class AMDGPUCodeGenPrepare : public FunctionPass { +private: + AMDGPUCodeGenPrepareImpl Impl; +public: + static char ID; + AMDGPUCodeGenPrepare() : FunctionPass(ID) { + initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry()); + } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<LegacyDivergenceAnalysis>(); + AU.addRequired<UniformityInfoWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); // FIXME: Division expansion needs to preserve the dominator tree. 
if (!ExpandDiv64InIR) AU.setPreservesAll(); - } + } + bool runOnFunction(Function &F) override; + bool doInitialization(Module &M) override; + StringRef getPassName() const override { return "AMDGPU IR optimizations"; } }; } // end anonymous namespace -unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const { +bool AMDGPUCodeGenPrepareImpl::run(Function &F) { + bool MadeChange = false; + + Function::iterator NextBB; + for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) { + BasicBlock *BB = &*FI; + NextBB = std::next(FI); + + BasicBlock::iterator Next; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; + I = Next) { + Next = std::next(I); + + MadeChange |= visit(*I); + + if (Next != E) { // Control flow changed + BasicBlock *NextInstBB = Next->getParent(); + if (NextInstBB != BB) { + BB = NextInstBB; + E = BB->end(); + FE = F.end(); + } + } + } + } + return MadeChange; +} + +unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const { assert(needsPromotionToI32(T) && "T does not need promotion to i32"); if (T->isIntegerTy()) @@ -241,7 +354,7 @@ unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const { return cast<VectorType>(T)->getElementType()->getIntegerBitWidth(); } -Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const { +Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const { assert(needsPromotionToI32(T) && "T does not need promotion to i32"); if (T->isIntegerTy()) @@ -249,17 +362,17 @@ Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const { return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T)); } -bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const { +bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const { return I.getOpcode() == Instruction::AShr || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem; } -bool 
AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const { +bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const { return isa<ICmpInst>(I.getOperand(0)) ? cast<ICmpInst>(I.getOperand(0))->isSigned() : false; } -bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const { +bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const { if (!Widen16BitOps) return false; @@ -279,6 +392,11 @@ bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const { return false; } +bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const { + return Ty->isFloatTy() || Ty->isDoubleTy() || + (Ty->isHalfTy() && ST->has16BitInsts()); +} + // Return true if the op promoted to i32 should have nsw set. static bool promotedOpIsNSW(const Instruction &I) { switch (I.getOpcode()) { @@ -307,16 +425,16 @@ static bool promotedOpIsNUW(const Instruction &I) { } } -bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const { +bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const { Type *Ty = I.getType(); const DataLayout &DL = Mod->getDataLayout(); int TySize = DL.getTypeSizeInBits(Ty); Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty); - return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I); + return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I); } -bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const { +bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const { assert(needsPromotionToI32(I.getType()) && "I does not need promotion to i32"); @@ -363,7 +481,7 @@ bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const { return true; } -bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const { +bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const { assert(needsPromotionToI32(I.getOperand(0)->getType()) && "I does not need promotion to i32"); @@ -390,7 +508,7 @@ bool 
AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const { return true; } -bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const { +bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const { assert(needsPromotionToI32(I.getType()) && "I does not need promotion to i32"); @@ -419,7 +537,7 @@ bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const { return true; } -bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32( +bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32( IntrinsicInst &I) const { assert(I.getIntrinsicID() == Intrinsic::bitreverse && "I must be bitreverse intrinsic"); @@ -445,11 +563,11 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32( return true; } -unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op) const { +unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const { return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits(); } -unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op) const { +unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const { return ComputeMaxSignificantBits(Op, *DL, 0, AC); } @@ -508,7 +626,7 @@ static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS, return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32)); } -bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const { +bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { if (I.getOpcode() != Instruction::Mul) return false; @@ -518,7 +636,7 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const { return false; // Prefer scalar if this could be s_mul_i32 - if (DA->isUniform(&I)) + if (UA->isUniform(&I)) return false; Value *LHS = I.getOperand(0); @@ -592,7 +710,7 @@ static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) { return nullptr; } -bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const { +bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const 
{ // Don't do this unless the old select is going away. We want to eliminate the // binary operator, not replace a binop with a select. int SelOpNo = 0; @@ -653,30 +771,191 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const { return true; } +std::pair<Value *, Value *> +AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder, + Value *Src) const { + Type *Ty = Src->getType(); + Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp, + {Ty, Builder.getInt32Ty()}, Src); + Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0}); + + // Bypass the bug workaround for the exponent result since it doesn't matter. + // TODO: Does the bug workaround even really need to consider the exponent + // result? It's unspecified by the spec. + + Value *FrexpExp = + ST->hasFractBug() + ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp, + {Builder.getInt32Ty(), Ty}, Src) + : Builder.CreateExtractValue(Frexp, {1}); + return {FrexpMant, FrexpExp}; +} + +/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals. +Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder, + Value *Src, + bool IsNegative) const { + // Same as for 1.0, but expand the sign out of the constant. + // -1.0 / x -> rcp (fneg x) + if (IsNegative) + Src = Builder.CreateFNeg(Src); + + // The rcp instruction doesn't support denormals, so scale the input + // out of the denormal range and convert at the end. + // + // Expand as 2^-n * (1.0 / (x * 2^n)) + + // TODO: Skip scaling if input is known never denormal and the input + // range won't underflow to denormal. The hard part is knowing the + // result. We need a range check, the result could be denormal for + // 0x1p+126 < den <= 0x1p+127. 
+ + Type *Ty = Src->getType(); + + auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src); + Value *ScaleFactor = Builder.CreateNeg(FrexpExp); + Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant); + return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()}, + {Rcp, ScaleFactor}); +} + +/// Emit a 2ulp expansion for fdiv by using frexp for input scaling. +Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, + Value *RHS, + FastMathFlags FMF) const { + // If we have have to work around the fract/frexp bug, we're worse off than + // using the fdiv.fast expansion. The full safe expansion is faster if we have + // fast FMA. + if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() && + (!FMF.noNaNs() || !FMF.noInfs())) + return nullptr; + + // We're scaling the LHS to avoid a denormal input, and scale the denominator + // to avoid large values underflowing the result. + Type *Ty = LHS->getType(); + + auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS); + + Value *Rcp = + Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS); + + auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS); + Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp); + + // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the + // result. + Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS); + return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()}, + {Mul, ExpDiff}); +} + +/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals. +static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, + bool IsNegative) { + // bool need_scale = x < 0x1p-126f; + // float input_scale = need_scale ? 0x1.0p+24f : 1.0f; + // float output_scale = need_scale ? 
0x1.0p+12f : 1.0f; + // rsq(x * input_scale) * output_scale; + + Type *Ty = Src->getType(); + APFloat SmallestNormal = + APFloat::getSmallestNormalized(Ty->getFltSemantics()); + Value *NeedScale = + Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal)); + Constant *One = ConstantFP::get(Ty, 1.0); + Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24); + Constant *OutputScale = + ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12); + + Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One); + + Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor); + Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput); + Value *OutputScaleFactor = Builder.CreateSelect( + NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One); + + return Builder.CreateFMul(Rsq, OutputScaleFactor); +} + +bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp, + FastMathFlags DivFMF, + FastMathFlags SqrtFMF) const { + // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp. + if (!DivFMF.allowContract() || !SqrtFMF.allowContract()) + return false; + + // v_rsq_f32 gives 1ulp + return SqrtFMF.approxFunc() || HasUnsafeFPMath || + SqrtOp->getFPAccuracy() >= 1.0f; +} + +Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( + IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF, + FastMathFlags SqrtFMF, const Instruction *CtxI) const { + // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp. + assert(DivFMF.allowContract() && SqrtFMF.allowContract()); + + // rsq_f16 is accurate to 0.51 ulp. + // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. + // rsq_f64 is never accurate. + const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num); + if (!CLHS) + return nullptr; + + assert(Den->getType()->isFloatTy()); + + bool IsNegative = false; + + // TODO: Handle other numerator values with arcp. 
+ if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) { + // Add in the sqrt flags. + IRBuilder<>::FastMathFlagGuard Guard(Builder); + DivFMF |= SqrtFMF; + Builder.setFastMathFlags(DivFMF); + + if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || + canIgnoreDenormalInput(Den, CtxI)) { + Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den); + // -1.0 / sqrt(x) -> fneg(rsq(x)) + return IsNegative ? Builder.CreateFNeg(Result) : Result; + } + + return emitRsqIEEE1ULP(Builder, Den, IsNegative); + } + + return nullptr; +} + // Optimize fdiv with rcp: // // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is // allowed with unsafe-fp-math or afn. // -// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn. -static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp, - bool RcpIsAccurate, IRBuilder<> &Builder, - Module *Mod) { - - if (!AllowInaccurateRcp && !RcpIsAccurate) - return nullptr; +// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0 +Value * +AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num, + Value *Den, FastMathFlags FMF, + const Instruction *CtxI) const { + // rcp_f16 is accurate to 0.51 ulp. + // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. + // rcp_f64 is never accurate. 
+ assert(Den->getType()->isFloatTy()); - Type *Ty = Den->getType(); if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) { - if (AllowInaccurateRcp || RcpIsAccurate) { - if (CLHS->isExactlyValue(1.0)) { - Function *Decl = Intrinsic::getDeclaration( - Mod, Intrinsic::amdgcn_rcp, Ty); + bool IsNegative = false; + if (CLHS->isExactlyValue(1.0) || + (IsNegative = CLHS->isExactlyValue(-1.0))) { + Value *Src = Den; + + if (HasFP32DenormalFlush || FMF.approxFunc()) { + // -1.0 / x -> 1.0 / fneg(x) + if (IsNegative) + Src = Builder.CreateFNeg(Src); // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to // the CI documentation has a worst case error of 1 ulp. - // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to - // use it as long as we aren't trying to use denormals. + // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK + // to use it as long as we aren't trying to use denormals. // // v_rcp_f16 and v_rsq_f16 DO support denormals. @@ -684,30 +963,29 @@ static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp, // insert rsq intrinsic here. // 1.0 / x -> rcp(x) - return Builder.CreateCall(Decl, { Den }); + return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src); } - // Same as for 1.0, but expand the sign out of the constant. - if (CLHS->isExactlyValue(-1.0)) { - Function *Decl = Intrinsic::getDeclaration( - Mod, Intrinsic::amdgcn_rcp, Ty); - - // -1.0 / x -> rcp (fneg x) - Value *FNeg = Builder.CreateFNeg(Den); - return Builder.CreateCall(Decl, { FNeg }); - } + // TODO: If the input isn't denormal, and we know the input exponent isn't + // big enough to introduce a denormal we can avoid the scaling. + return emitRcpIEEE1ULP(Builder, Src, IsNegative); } } - if (AllowInaccurateRcp) { - Function *Decl = Intrinsic::getDeclaration( - Mod, Intrinsic::amdgcn_rcp, Ty); - - // Turn into multiply by the reciprocal. 
+ if (FMF.allowReciprocal()) { // x / y -> x * (1.0 / y) - Value *Recip = Builder.CreateCall(Decl, { Den }); + + // TODO: Could avoid denormal scaling and use raw rcp if we knew the output + // will never underflow. + if (HasFP32DenormalFlush || FMF.approxFunc()) { + Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den); + return Builder.CreateFMul(Num, Recip); + } + + Value *Recip = emitRcpIEEE1ULP(Builder, Den, false); return Builder.CreateFMul(Num, Recip); } + return nullptr; } @@ -718,17 +996,14 @@ static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp, // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. // // NOTE: optimizeWithRcp should be tried first because rcp is the preference. -static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy, - bool HasDenormals, IRBuilder<> &Builder, - Module *Mod) { +Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast( + IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const { // fdiv.fast can achieve 2.5 ULP accuracy. if (ReqdAccuracy < 2.5f) return nullptr; // Only have fdiv.fast for f32. - Type *Ty = Den->getType(); - if (!Ty->isFloatTy()) - return nullptr; + assert(Den->getType()->isFloatTy()); bool NumIsOne = false; if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) { @@ -737,11 +1012,39 @@ static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy, } // fdiv does not support denormals. But 1.0/x is always fine to use it. - if (HasDenormals && !NumIsOne) + // + // TODO: This works for any value with a specific known exponent range, don't + // just limit to constant 1. 
+ if (!HasFP32DenormalFlush && !NumIsOne) return nullptr; - Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); - return Builder.CreateCall(Decl, { Num, Den }); + return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den}); +} + +Value *AMDGPUCodeGenPrepareImpl::visitFDivElement( + IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF, + FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst, + float ReqdDivAccuracy) const { + if (RsqOp) { + Value *Rsq = + optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst); + if (Rsq) + return Rsq; + } + + Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst); + if (Rcp) + return Rcp; + + // In the basic case fdiv_fast has the same instruction count as the frexp div + // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can + // potentially be fused into a user. Also, materialization of the constants + // can be reused for multiple instances. + Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy); + if (FDivFast) + return FDivFast; + + return emitFrexpDiv(Builder, Num, Den, DivFMF); } // Optimizations is performed based on fpmath, fast math flags as well as @@ -759,100 +1062,96 @@ static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy, // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. // // NOTE: rcp is the preference in cases that both are legal. -bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { +bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { + if (DisableFDivExpand) + return false; Type *Ty = FDiv.getType()->getScalarType(); + if (!Ty->isFloatTy()) + return false; // The f64 rcp/rsq approximations are pretty inaccurate. We can do an - // expansion around them in codegen. - if (Ty->isDoubleTy()) + // expansion around them in codegen. f16 is good enough to always use. 
+ + const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv); + const FastMathFlags DivFMF = FPOp->getFastMathFlags(); + const float ReqdAccuracy = FPOp->getFPAccuracy(); + + // Inaccurate rcp is allowed with unsafe-fp-math or afn. + // + // Defer to codegen to handle this. + // + // TODO: Decide on an interpretation for interactions between afn + arcp + + // !fpmath, and make it consistent between here and codegen. For now, defer + // expansion of afn to codegen. The current interpretation is so aggressive we + // don't need any pre-consideration here when we have better information. A + // more conservative interpretation could use handling here. + const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc(); + if (AllowInaccurateRcp) return false; - // No intrinsic for fdiv16 if target does not support f16. - if (Ty->isHalfTy() && !ST->has16BitInsts()) + // Defer the correct implementations to codegen. + if (ReqdAccuracy < 1.0f) return false; - const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv); - const float ReqdAccuracy = FPOp->getFPAccuracy(); + FastMathFlags SqrtFMF; - // Inaccurate rcp is allowed with unsafe-fp-math or afn. - FastMathFlags FMF = FPOp->getFastMathFlags(); - const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc(); + Value *Num = FDiv.getOperand(0); + Value *Den = FDiv.getOperand(1); - // rcp_f16 is accurate for !fpmath >= 1.0ulp. - // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. - // rcp_f64 is never accurate. 
- const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) || - (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f); + Value *RsqOp = nullptr; + auto *DenII = dyn_cast<IntrinsicInst>(Den); + if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt && + DenII->hasOneUse()) { + const auto *SqrtOp = cast<FPMathOperator>(DenII); + SqrtFMF = SqrtOp->getFastMathFlags(); + if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF)) + RsqOp = SqrtOp->getOperand(0); + } IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator())); - Builder.setFastMathFlags(FMF); + Builder.setFastMathFlags(DivFMF); Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); - Value *Num = FDiv.getOperand(0); - Value *Den = FDiv.getOperand(1); + SmallVector<Value *, 4> NumVals; + SmallVector<Value *, 4> DenVals; + SmallVector<Value *, 4> RsqDenVals; + extractValues(Builder, NumVals, Num); + extractValues(Builder, DenVals, Den); - Value *NewFDiv = nullptr; - if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) { - NewFDiv = PoisonValue::get(VT); + if (RsqOp) + extractValues(Builder, RsqDenVals, RsqOp); - // FIXME: Doesn't do the right thing for cases where the vector is partially - // constant. This works when the scalarizer pass is run first. - for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { - Value *NumEltI = Builder.CreateExtractElement(Num, I); - Value *DenEltI = Builder.CreateExtractElement(Den, I); - // Try rcp first. - Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp, - RcpIsAccurate, Builder, Mod); - if (!NewElt) // Try fdiv.fast. - NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy, - HasFP32Denormals, Builder, Mod); - if (!NewElt) // Keep the original. - NewElt = Builder.CreateFDiv(NumEltI, DenEltI); + SmallVector<Value *, 4> ResultVals(NumVals.size()); + for (int I = 0, E = NumVals.size(); I != E; ++I) { + Value *NumElt = NumVals[I]; + Value *DenElt = DenVals[I]; + Value *RsqDenElt = RsqOp ? 
RsqDenVals[I] : nullptr; - NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); - } - } else { // Scalar FDiv. - // Try rcp first. - NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate, - Builder, Mod); - if (!NewFDiv) { // Try fdiv.fast. - NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals, - Builder, Mod); + Value *NewElt = + visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt, + cast<Instruction>(FPOp), ReqdAccuracy); + if (!NewElt) { + // Keep the original, but scalarized. + + // This has the unfortunate side effect of sometimes scalarizing when + // we're not going to do anything. + NewElt = Builder.CreateFDiv(NumElt, DenElt); + if (auto *NewEltInst = dyn_cast<Instruction>(NewElt)) + NewEltInst->copyMetadata(FDiv); } - } - if (NewFDiv) { - FDiv.replaceAllUsesWith(NewFDiv); - NewFDiv->takeName(&FDiv); - FDiv.eraseFromParent(); + ResultVals[I] = NewElt; } - return !!NewFDiv; -} + Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals); -bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) { - // Match the Xor instruction, its type and its operands - IntrinsicInst *IntrinsicCall = dyn_cast<IntrinsicInst>(I.getOperand(0)); - ConstantInt *RHS = dyn_cast<ConstantInt>(I.getOperand(1)); - if (!RHS || !IntrinsicCall || RHS->getSExtValue() != -1) - return visitBinaryOperator(I); - - // Check if the Call is an intrinsic instruction to amdgcn_class intrinsic - // has only one use - if (IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class || - !IntrinsicCall->hasOneUse()) - return visitBinaryOperator(I); - - // "Not" the second argument of the intrinsic call - ConstantInt *Arg = dyn_cast<ConstantInt>(IntrinsicCall->getOperand(1)); - if (!Arg) - return visitBinaryOperator(I); + if (NewVal) { + FDiv.replaceAllUsesWith(NewVal); + NewVal->takeName(&FDiv); + RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLInfo); + } - IntrinsicCall->setOperand( - 1, ConstantInt::get(Arg->getType(), 
Arg->getZExtValue() ^ 0x3ff)); - I.replaceAllUsesWith(IntrinsicCall); - I.eraseFromParent(); return true; } @@ -882,9 +1181,9 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) { /// Figure out how many bits are really needed for this division. \p AtLeast is /// an optimization hint to bypass the second ComputeNumSignBits call if we the /// first one is insufficient. Returns -1 on failure. -int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I, - Value *Num, Value *Den, - unsigned AtLeast, bool IsSigned) const { +int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num, + Value *Den, unsigned AtLeast, + bool IsSigned) const { const DataLayout &DL = Mod->getDataLayout(); unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I); if (LHSSignBits < AtLeast) @@ -903,21 +1202,19 @@ int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I, // The fractional part of a float is enough to accurately represent up to // a 24-bit signed integer. -Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, - BinaryOperator &I, - Value *Num, Value *Den, - bool IsDiv, bool IsSigned) const { +Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder, + BinaryOperator &I, Value *Num, + Value *Den, bool IsDiv, + bool IsSigned) const { int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned); if (DivBits == -1) return nullptr; return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned); } -Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder, - BinaryOperator &I, - Value *Num, Value *Den, - unsigned DivBits, - bool IsDiv, bool IsSigned) const { +Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl( + IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den, + unsigned DivBits, bool IsDiv, bool IsSigned) const { Type *I32Ty = Builder.getInt32Ty(); Num = Builder.CreateTrunc(Num, I32Ty); Den = Builder.CreateTrunc(Den, I32Ty); @@ -1017,8 +1314,9 @@ Value 
*AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder, // than the general expansion we do here. // TODO: It would be better to just directly handle those optimizations here. -bool AMDGPUCodeGenPrepare::divHasSpecialOptimization( - BinaryOperator &I, Value *Num, Value *Den) const { +bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I, + Value *Num, + Value *Den) const { if (Constant *C = dyn_cast<Constant>(Den)) { // Arbitrary constants get a better expansion as long as a wider mulhi is // legal. @@ -1059,9 +1357,9 @@ static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) { return Builder.CreateAShr(V, Builder.getInt32(31)); } -Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder, - BinaryOperator &I, Value *X, - Value *Y) const { +Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder, + BinaryOperator &I, Value *X, + Value *Y) const { Instruction::BinaryOps Opc = I.getOpcode(); assert(Opc == Instruction::URem || Opc == Instruction::UDiv || Opc == Instruction::SRem || Opc == Instruction::SDiv); @@ -1147,7 +1445,7 @@ Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder, Value *FloatY = Builder.CreateUIToFP(Y, F32Ty); Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty); Value *RcpY = Builder.CreateCall(Rcp, {FloatY}); - Constant *Scale = ConstantFP::get(F32Ty, BitsToFloat(0x4F7FFFFE)); + Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE)); Value *ScaledY = Builder.CreateFMul(RcpY, Scale); Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty); @@ -1184,9 +1482,9 @@ Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder, return Res; } -Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder, - BinaryOperator &I, - Value *Num, Value *Den) const { +Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder, + BinaryOperator &I, Value *Num, + Value *Den) const { if (!ExpandDiv64InIR && 
divHasSpecialOptimization(I, Num, Den)) return nullptr; // Keep it for later optimization. @@ -1215,7 +1513,7 @@ Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder, return nullptr; } -void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const { +void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const { Instruction::BinaryOps Opc = I.getOpcode(); // Do the general expansion. if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) { @@ -1231,12 +1529,12 @@ void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const { llvm_unreachable("not a division"); } -bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { +bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (foldBinOpIntoSelect(I)) return true; if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && - DA->isUniform(&I) && promoteUniformOpToI32(I)) + UA->isUniform(&I) && promoteUniformOpToI32(I)) return true; if (UseMul24Intrin && replaceMulWithMul24(I)) @@ -1307,6 +1605,7 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { // TODO: We get much worse code in specially handled constant cases. 
for (BinaryOperator *Div : Div64ToExpand) { expandDivRem64(*Div); + FlowChanged = true; Changed = true; } } @@ -1314,7 +1613,7 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { return Changed; } -bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { +bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) { if (!WidenLoads) return false; @@ -1325,9 +1624,7 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { Builder.SetCurrentDebugLocation(I.getDebugLoc()); Type *I32Ty = Builder.getInt32Ty(); - Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace()); - Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT); - LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast); + LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand()); WidenLoad->copyMetadata(I); // If we have range metadata, we need to convert the type, and not make @@ -1362,48 +1659,420 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { return false; } -bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) { +bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) { bool Changed = false; if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) && - DA->isUniform(&I)) + UA->isUniform(&I)) Changed |= promoteUniformOpToI32(I); return Changed; } -bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) { - bool Changed = false; +bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { + Value *Cond = I.getCondition(); + Value *TrueVal = I.getTrueValue(); + Value *FalseVal = I.getFalseValue(); + Value *CmpVal; + FCmpInst::Predicate Pred; - if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && - DA->isUniform(&I)) - Changed |= promoteUniformOpToI32(I); + if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) { + if (UA->isUniform(&I)) + return promoteUniformOpToI32(I); + return false; + } - return Changed; + // Match fract pattern with nan check. 
+ if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN()))) + return false; + + FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I); + if (!FPOp) + return false; + + IRBuilder<> Builder(&I); + Builder.setFastMathFlags(FPOp->getFastMathFlags()); + + auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal); + auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal); + + Value *Fract = nullptr; + if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse && + CmpVal == matchFractPat(*IIFalse)) { + // isnan(x) ? x : fract(x) + Fract = applyFractPat(Builder, CmpVal); + } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue && + CmpVal == matchFractPat(*IITrue)) { + // !isnan(x) ? fract(x) : x + Fract = applyFractPat(Builder, CmpVal); + } else + return false; + + Fract->takeName(&I); + I.replaceAllUsesWith(Fract); + RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo); + return true; +} + +static bool areInSameBB(const Value *A, const Value *B) { + const auto *IA = dyn_cast<Instruction>(A); + const auto *IB = dyn_cast<Instruction>(B); + return IA && IB && IA->getParent() == IB->getParent(); +} + +// Helper for breaking large PHIs that returns true when an extractelement on V +// is likely to be folded away by the DAG combiner. +static bool isInterestingPHIIncomingValue(const Value *V) { + const auto *FVT = dyn_cast<FixedVectorType>(V->getType()); + if (!FVT) + return false; + + const Value *CurVal = V; + + // Check for insertelements, keeping track of the elements covered. + BitVector EltsCovered(FVT->getNumElements()); + while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) { + const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2)); + + // Non constant index/out of bounds index -> folding is unlikely. + // The latter is more of a sanity check because canonical IR should just + // have replaced those with poison. 
+ if (!Idx || Idx->getSExtValue() >= FVT->getNumElements()) + return false; + + const auto *VecSrc = IE->getOperand(0); + + // If the vector source is another instruction, it must be in the same basic + // block. Otherwise, the DAGCombiner won't see the whole thing and is + // unlikely to be able to do anything interesting here. + if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE)) + return false; + + CurVal = VecSrc; + EltsCovered.set(Idx->getSExtValue()); + + // All elements covered. + if (EltsCovered.all()) + return true; + } + + // We either didn't find a single insertelement, or the insertelement chain + // ended before all elements were covered. Check for other interesting values. + + // Constants are always interesting because we can just constant fold the + // extractelements. + if (isa<Constant>(CurVal)) + return true; + + // shufflevector is likely to be profitable if either operand is a constant, + // or if either source is in the same block. + // This is because shufflevector is most often lowered as a series of + // insert/extract elements anyway. + if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) { + return isa<Constant>(SV->getOperand(1)) || + areInSameBB(SV, SV->getOperand(0)) || + areInSameBB(SV, SV->getOperand(1)); + } + + return false; } -bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) { +bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) { + // Check in the cache, or add an entry for this node. + // + // We init with false because we consider all PHI nodes unbreakable until we + // reach a conclusion. Doing the opposite - assuming they're break-able until + // proven otherwise - can be harmful in some pathological cases so we're + // conservative for now. 
+ const auto [It, DidInsert] = BreakPhiNodesCache.insert({&I, false}); + if (!DidInsert) + return It->second; + + // This function may recurse, so to guard against infinite looping, this PHI + // is conservatively considered unbreakable until we reach a conclusion. + + // Don't break PHIs that have no interesting incoming values. That is, where + // there is no clear opportunity to fold the "extractelement" instructions we + // would add. + // + // Note: IC does not run after this pass, so we're only interested in the + // foldings that the DAG combiner can do. + if (none_of(I.incoming_values(), + [&](Value *V) { return isInterestingPHIIncomingValue(V); })) + return false; + + // Now, check users for unbreakable PHI nodes. If we have an unbreakable PHI + // node as user, we don't want to break this PHI either because it's unlikely + // to be beneficial. We would just explode the vector and reassemble it + // directly, wasting instructions. + // + // In the case where multiple users are PHI nodes, we want at least half of + // them to be breakable. + int Score = 0; + for (const Value *U : I.users()) { + if (const auto *PU = dyn_cast<PHINode>(U)) + Score += canBreakPHINode(*PU) ? 1 : -1; + } + + if (Score < 0) + return false; + + return BreakPhiNodesCache[&I] = true; +} + +/// Helper class for "break large PHIs" (visitPHINode). +/// +/// This represents a slice of a PHI's incoming value, which is made up of: +/// - The type of the slice (Ty) +/// - The index in the incoming value's vector where the slice starts (Idx) +/// - The number of elements in the slice (NumElts). +/// It also keeps track of the NewPHI node inserted for this particular slice. +/// +/// Slice examples: +/// <4 x i64> -> Split into four i64 slices. +/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1] +/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail. 
+/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1] +class VectorSlice { +public: + VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts) + : Ty(Ty), Idx(Idx), NumElts(NumElts) {} + + Type *Ty = nullptr; + unsigned Idx = 0; + unsigned NumElts = 0; + PHINode *NewPHI = nullptr; + + /// Slice \p Inc according to the information contained within this slice. + /// This is cached, so if called multiple times for the same \p BB & \p Inc + /// pair, it returns the same Sliced value as well. + /// + /// Note this *intentionally* does not return the same value for, say, + /// [%bb.0, %0] & [%bb.1, %0] as: + /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then + /// the value in bb.1 may not be reachable from bb.0 if it's its + /// predecessor.) + /// - We also want to make our extract instructions as local as possible so + /// the DAG has better chances of folding them out. Duplicating them like + /// that is beneficial in that regard. + /// + /// This is both a minor optimization to avoid creating duplicate + /// instructions, but also a requirement for correctness. It is not forbidden + /// for a PHI node to have the same [BB, Val] pair multiple times. If we + /// returned a new value each time, those previously identical pairs would all + /// have different incoming values (from the same block) and it'd cause a "PHI + /// node has multiple entries for the same basic block with different incoming + /// values!" verifier error. 
+ Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) { + Value *&Res = SlicedVals[{BB, Inc}]; + if (Res) + return Res; + + IRBuilder<> B(BB->getTerminator()); + if (Instruction *IncInst = dyn_cast<Instruction>(Inc)) + B.SetCurrentDebugLocation(IncInst->getDebugLoc()); + + if (NumElts > 1) { + SmallVector<int, 4> Mask; + for (unsigned K = Idx; K < (Idx + NumElts); ++K) + Mask.push_back(K); + Res = B.CreateShuffleVector(Inc, Mask, NewValName); + } else + Res = B.CreateExtractElement(Inc, Idx, NewValName); + + return Res; + } + +private: + SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals; +}; + +bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) { + // Break-up fixed-vector PHIs into smaller pieces. + // Default threshold is 32, so it breaks up any vector that's >32 bits into + // its elements, or into 32-bit pieces (for 8/16 bit elts). + // + // This is only helpful for DAGISel because it doesn't handle large PHIs as + // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg. + // With large, odd-sized PHIs we may end up needing many `build_vector` + // operations with most elements being "undef". This inhibits a lot of + // optimization opportunities and can result in unreasonably high register + // pressure and the inevitable stack spilling. + if (!ScalarizeLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption) + return false; + + FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType()); + if (!FVT || DL->getTypeSizeInBits(FVT) <= ScalarizeLargePHIsThreshold) + return false; + + if (!ForceScalarizeLargePHIs && !canBreakPHINode(I)) + return false; + + std::vector<VectorSlice> Slices; + + Type *EltTy = FVT->getElementType(); + { + unsigned Idx = 0; + // For 8/16 bits type, don't scalarize fully but break it up into as many + // 32-bit slices as we can, and scalarize the tail. 
+ const unsigned EltSize = DL->getTypeSizeInBits(EltTy); + const unsigned NumElts = FVT->getNumElements(); + if (EltSize == 8 || EltSize == 16) { + const unsigned SubVecSize = (32 / EltSize); + Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize); + for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End; + Idx += SubVecSize) + Slices.emplace_back(SubVecTy, Idx, SubVecSize); + } + + // Scalarize all remaining elements. + for (; Idx < NumElts; ++Idx) + Slices.emplace_back(EltTy, Idx, 1); + } + + if (Slices.size() == 1) + return false; + + // Create one PHI per vector piece. The "VectorSlice" class takes care of + // creating the necessary instruction to extract the relevant slices of each + // incoming value. + IRBuilder<> B(I.getParent()); + B.SetCurrentDebugLocation(I.getDebugLoc()); + + unsigned IncNameSuffix = 0; + for (VectorSlice &S : Slices) { + // We need to reset the build on each iteration, because getSlicedVal may + // have inserted something into I's BB. + B.SetInsertPoint(I.getParent()->getFirstNonPHI()); + S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues()); + + for (const auto &[Idx, BB] : enumerate(I.blocks())) { + S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx), + "largephi.extractslice" + + std::to_string(IncNameSuffix++)), + BB); + } + } + + // And replace this PHI with a vector of all the previous PHI values. 
+ Value *Vec = PoisonValue::get(FVT); + unsigned NameSuffix = 0; + for (VectorSlice &S : Slices) { + const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++); + if (S.NumElts > 1) + Vec = + B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName); + else + Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName); + } + + I.replaceAllUsesWith(Vec); + I.eraseFromParent(); + return true; +} + +bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { switch (I.getIntrinsicID()) { case Intrinsic::bitreverse: return visitBitreverseIntrinsicInst(I); + case Intrinsic::minnum: + return visitMinNum(I); default: return false; } } -bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) { +bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) { bool Changed = false; if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && - DA->isUniform(&I)) + UA->isUniform(&I)) Changed |= promoteUniformBitreverseToI32(I); return Changed; } +/// Match non-nan fract pattern. +/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0) +/// +/// If fract is a useful instruction for the subtarget. Does not account for the +/// nan handling; the instruction has a nan check on the input value. 
+Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) { + if (ST->hasFractBug()) + return nullptr; + + if (I.getIntrinsicID() != Intrinsic::minnum) + return nullptr; + + Type *Ty = I.getType(); + if (!isLegalFloatingTy(Ty->getScalarType())) + return nullptr; + + Value *Arg0 = I.getArgOperand(0); + Value *Arg1 = I.getArgOperand(1); + + const APFloat *C; + if (!match(Arg1, m_APFloat(C))) + return nullptr; + + APFloat One(1.0); + bool LosesInfo; + One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo); + + // Match nextafter(1.0, -1) + One.next(true); + if (One != *C) + return nullptr; + + Value *FloorSrc; + if (match(Arg0, m_FSub(m_Value(FloorSrc), + m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc))))) + return FloorSrc; + return nullptr; +} + +Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder, + Value *FractArg) { + SmallVector<Value *, 4> FractVals; + extractValues(Builder, FractVals, FractArg); + + SmallVector<Value *, 4> ResultVals(FractVals.size()); + + Type *Ty = FractArg->getType()->getScalarType(); + for (unsigned I = 0, E = FractVals.size(); I != E; ++I) { + ResultVals[I] = + Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]}); + } + + return insertValues(Builder, FractArg->getType(), ResultVals); +} + +bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) { + Value *FractArg = matchFractPat(I); + if (!FractArg) + return false; + + // Match pattern for fract intrinsic in contexts where the nan check has been + // optimized out (and hope the knowledge the source can't be nan wasn't lost). 
+ if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, *DL, TLInfo)) + return false; + + IRBuilder<> Builder(&I); + FastMathFlags FMF = I.getFastMathFlags(); + FMF.setNoNaNs(); + Builder.setFastMathFlags(FMF); + + Value *Fract = applyFractPat(Builder, FractArg); + Fract->takeName(&I); + I.replaceAllUsesWith(Fract); + + RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo); + return true; +} + bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { - Mod = &M; - DL = &Mod->getDataLayout(); + Impl.Mod = &M; + Impl.DL = &Impl.Mod->getDataLayout(); return false; } @@ -1416,49 +2085,44 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { return false; const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>(); - ST = &TM.getSubtarget<GCNSubtarget>(F); - AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - DA = &getAnalysis<LegacyDivergenceAnalysis>(); - + Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + Impl.ST = &TM.getSubtarget<GCNSubtarget>(F); + Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - DT = DTWP ? &DTWP->getDomTree() : nullptr; - - HasUnsafeFPMath = hasUnsafeFPMath(F); - - AMDGPU::SIModeRegisterDefaults Mode(F); - HasFP32Denormals = Mode.allFP32Denormals(); - - bool MadeChange = false; - - Function::iterator NextBB; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) { - BasicBlock *BB = &*FI; - NextBB = std::next(FI); - - BasicBlock::iterator Next; - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) { - Next = std::next(I); - - MadeChange |= visit(*I); - - if (Next != E) { // Control flow changed - BasicBlock *NextInstBB = Next->getParent(); - if (NextInstBB != BB) { - BB = NextInstBB; - E = BB->end(); - FE = F.end(); - } - } - } - } + Impl.DT = DTWP ? 
&DTWP->getDomTree() : nullptr; + Impl.HasUnsafeFPMath = hasUnsafeFPMath(F); + SIModeRegisterDefaults Mode(F); + Impl.HasFP32DenormalFlush = + Mode.FP32Denormals == DenormalMode::getPreserveSign(); + return Impl.run(F); +} - return MadeChange; +PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F, + FunctionAnalysisManager &FAM) { + AMDGPUCodeGenPrepareImpl Impl; + Impl.Mod = F.getParent(); + Impl.DL = &Impl.Mod->getDataLayout(); + Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F); + Impl.ST = &TM.getSubtarget<GCNSubtarget>(F); + Impl.AC = &FAM.getResult<AssumptionAnalysis>(F); + Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F); + Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); + Impl.HasUnsafeFPMath = hasUnsafeFPMath(F); + SIModeRegisterDefaults Mode(F); + Impl.HasFP32DenormalFlush = + Mode.FP32Denormals == DenormalMode::getPreserveSign(); + PreservedAnalyses PA = PreservedAnalyses::none(); + if (!Impl.FlowChanged) + PA.preserveSet<CFGAnalyses>(); + return Impl.run(F) ? PA : PreservedAnalyses::all(); } INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index c11d4656db3f..892e1eef27a8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -10,31 +10,31 @@ include "llvm/Target/GlobalISel/Combine.td" // TODO: This really belongs after legalization after scalarization. 
-def fmin_fmax_legacy_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::FMinFMaxLegacyInfo">; +def fmin_fmax_legacy_matchdata : GIDefMatchData<"FMinFMaxLegacyInfo">; let Predicates = [HasFminFmaxLegacy] in def fcmp_select_to_fmin_fmax_legacy : GICombineRule< (defs root:$select, fmin_fmax_legacy_matchdata:$matchinfo), (match (wip_match_opcode G_SELECT):$select, - [{ return PostLegalizerHelper.matchFMinFMaxLegacy(*${select}, ${matchinfo}); }]), - (apply [{ PostLegalizerHelper.applySelectFCmpToFMinToFMaxLegacy(*${select}, ${matchinfo}); }])>; + [{ return matchFMinFMaxLegacy(*${select}, ${matchinfo}); }]), + (apply [{ applySelectFCmpToFMinToFMaxLegacy(*${select}, ${matchinfo}); }])>; def uchar_to_float : GICombineRule< (defs root:$itofp), (match (wip_match_opcode G_UITOFP, G_SITOFP):$itofp, - [{ return PostLegalizerHelper.matchUCharToFloat(*${itofp}); }]), - (apply [{ PostLegalizerHelper.applyUCharToFloat(*${itofp}); }])>; + [{ return matchUCharToFloat(*${itofp}); }]), + (apply [{ applyUCharToFloat(*${itofp}); }])>; def rcp_sqrt_to_rsq : GICombineRule< (defs root:$rcp, build_fn_matchinfo:$matchinfo), (match (wip_match_opcode G_INTRINSIC, G_FSQRT):$rcp, - [{ return PostLegalizerHelper.matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]), + [{ return matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>; -def cvt_f32_ubyteN_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::CvtF32UByteMatchInfo">; +def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">; def cvt_f32_ubyteN : GICombineRule< (defs root:$cvt_f32_ubyteN, cvt_f32_ubyteN_matchdata:$matchinfo), @@ -42,18 +42,18 @@ def cvt_f32_ubyteN : GICombineRule< G_AMDGPU_CVT_F32_UBYTE1, G_AMDGPU_CVT_F32_UBYTE2, G_AMDGPU_CVT_F32_UBYTE3):$cvt_f32_ubyteN, - [{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]), - (apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>; + [{ return 
matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]), + (apply [{ applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>; -def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">; +def clamp_i64_to_i16_matchdata : GIDefMatchData<"ClampI64ToI16MatchInfo">; def clamp_i64_to_i16 : GICombineRule< (defs root:$clamp_i64_to_i16, clamp_i64_to_i16_matchdata:$matchinfo), (match (wip_match_opcode G_TRUNC):$clamp_i64_to_i16, - [{ return PreLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]), - (apply [{ PreLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>; + [{ return matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, MF, ${matchinfo}); }]), + (apply [{ applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>; -def med3_matchdata : GIDefMatchData<"AMDGPURegBankCombinerHelper::Med3MatchInfo">; +def med3_matchdata : GIDefMatchData<"Med3MatchInfo">; def int_minmax_to_med3 : GICombineRule< (defs root:$min_or_max, med3_matchdata:$matchinfo), @@ -61,8 +61,8 @@ def int_minmax_to_med3 : GICombineRule< G_SMIN, G_UMAX, G_UMIN):$min_or_max, - [{ return RegBankHelper.matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), - (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>; + [{ return matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), + (apply [{ applyMed3(*${min_or_max}, ${matchinfo}); }])>; def fp_minmax_to_med3 : GICombineRule< (defs root:$min_or_max, med3_matchdata:$matchinfo), @@ -70,8 +70,8 @@ def fp_minmax_to_med3 : GICombineRule< G_FMINNUM, G_FMAXNUM_IEEE, G_FMINNUM_IEEE):$min_or_max, - [{ return RegBankHelper.matchFPMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), - (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>; + [{ return matchFPMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), + (apply [{ applyMed3(*${min_or_max}, ${matchinfo}); }])>; def fp_minmax_to_clamp : GICombineRule< (defs root:$min_or_max, 
register_matchinfo:$matchinfo), @@ -79,21 +79,21 @@ def fp_minmax_to_clamp : GICombineRule< G_FMINNUM, G_FMAXNUM_IEEE, G_FMINNUM_IEEE):$min_or_max, - [{ return RegBankHelper.matchFPMinMaxToClamp(*${min_or_max}, ${matchinfo}); }]), - (apply [{ RegBankHelper.applyClamp(*${min_or_max}, ${matchinfo}); }])>; + [{ return matchFPMinMaxToClamp(*${min_or_max}, ${matchinfo}); }]), + (apply [{ applyClamp(*${min_or_max}, ${matchinfo}); }])>; def fmed3_intrinsic_to_clamp : GICombineRule< (defs root:$fmed3, register_matchinfo:$matchinfo), - (match (wip_match_opcode G_INTRINSIC):$fmed3, - [{ return RegBankHelper.matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]), - (apply [{ RegBankHelper.applyClamp(*${fmed3}, ${matchinfo}); }])>; + (match (wip_match_opcode G_AMDGPU_FMED3):$fmed3, + [{ return matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]), + (apply [{ applyClamp(*${fmed3}, ${matchinfo}); }])>; def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">; def remove_fcanonicalize : GICombineRule< (defs root:$fcanonicalize, remove_fcanonicalize_matchinfo:$matchinfo), (match (wip_match_opcode G_FCANONICALIZE):$fcanonicalize, - [{ return PostLegalizerHelper.matchRemoveFcanonicalize(*${fcanonicalize}, ${matchinfo}); }]), + [{ return matchRemoveFcanonicalize(*${fcanonicalize}, ${matchinfo}); }]), (apply [{ Helper.replaceSingleDefInstWithReg(*${fcanonicalize}, ${matchinfo}); }])>; def foldable_fneg_matchdata : GIDefMatchData<"MachineInstr *">; @@ -104,32 +104,56 @@ def foldable_fneg : GICombineRule< [{ return Helper.matchFoldableFneg(*${ffn}, ${matchinfo}); }]), (apply [{ Helper.applyFoldableFneg(*${ffn}, ${matchinfo}); }])>; -// Combines which should only apply on SI/VI +def sign_exension_in_reg_matchdata : GIDefMatchData<"MachineInstr *">; + +def sign_extension_in_reg : GICombineRule< + (defs root:$sign_inreg, sign_exension_in_reg_matchdata:$matchinfo), + (match (wip_match_opcode G_SEXT_INREG):$sign_inreg, + [{ return matchCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }]), 
+ (apply [{ applyCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }])>; + + +let Predicates = [Has16BitInsts, NotHasMed3_16] in { +// For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This +// saves one instruction compared to the promotion. +// +// FIXME: Should have ComplexPattern like in/out matchers +// +// FIXME: We should be able to match either G_AMDGPU_FMED3 or +// G_INTRINSIC @llvm.amdgcn.fmed3. Currently the legalizer will +// replace the intrinsic with G_AMDGPU_FMED3 since we can't write a +// pattern to match it. +def expand_promoted_fmed3 : GICombineRule< + (defs root:$fptrunc_dst), + (match (G_FPTRUNC $fptrunc_dst, $fmed3_dst):$fptrunc, + (G_AMDGPU_FMED3 $fmed3_dst, $src0, $src1, $src2), + [{ return Helper.matchExpandPromotedF16FMed3(*${fptrunc}, ${src0}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }]), + (apply [{ Helper.applyExpandPromotedF16FMed3(*${fptrunc}, ${src0}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }]) +>; + +} // End Predicates = [Has16BitInsts, NotHasMed3_16] + +// Combines which should only apply on SI/CI def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>; -def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper< - "AMDGPUGenPreLegalizerCombinerHelper", +// Combines which should only apply on VI +def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>; + +def AMDGPUPreLegalizerCombiner: GICombinerHelper< + "AMDGPUPreLegalizerCombinerImpl", [all_combines, clamp_i64_to_i16, foldable_fneg]> { - let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule"; - let StateClass = "AMDGPUPreLegalizerCombinerHelperState"; - let AdditionalArguments = []; } -def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< - "AMDGPUGenPostLegalizerCombinerHelper", - [all_combines, gfx6gfx7_combines, +def AMDGPUPostLegalizerCombiner: GICombinerHelper< + "AMDGPUPostLegalizerCombinerImpl", + [all_combines, gfx6gfx7_combines, gfx8_combines, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, - 
rcp_sqrt_to_rsq]> { - let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule"; - let StateClass = "AMDGPUPostLegalizerCombinerHelperState"; - let AdditionalArguments = []; + rcp_sqrt_to_rsq, sign_extension_in_reg]> { } -def AMDGPURegBankCombinerHelper : GICombinerHelper< - "AMDGPUGenRegBankCombinerHelper", - [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, +def AMDGPURegBankCombiner : GICombinerHelper< + "AMDGPURegBankCombinerImpl", + [unmerge_merge, unmerge_cst, unmerge_undef, + zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> { - let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; - let StateClass = "AMDGPURegBankCombinerHelperState"; - let AdditionalArguments = []; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp index 069baf748bfa..78fdedc0b511 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp @@ -380,3 +380,56 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI, MI.eraseFromParent(); } + +// TODO: Should return converted value / extension source and avoid introducing +// intermediate fptruncs in the apply function. 
+static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI, + Register Reg) { + const MachineInstr *Def = MRI.getVRegDef(Reg); + if (Def->getOpcode() == TargetOpcode::G_FPEXT) { + Register SrcReg = Def->getOperand(1).getReg(); + return MRI.getType(SrcReg) == LLT::scalar(16); + } + + if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) { + APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF(); + bool LosesInfo = true; + Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo); + return !LosesInfo; + } + + return false; +} + +bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI, + Register Src0, + Register Src1, + Register Src2) { + assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC); + Register SrcReg = MI.getOperand(1).getReg(); + if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32)) + return false; + + return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) && + isFPExtFromF16OrConst(MRI, Src2); +} + +void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI, + Register Src0, + Register Src1, + Register Src2) { + Builder.setInstrAndDebugLoc(MI); + + // We expect fptrunc (fpext x) to fold out, and to constant fold any constant + // sources. 
+ Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0); + Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0); + Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0); + + LLT Ty = MRI.getType(Src0); + auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1); + auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1); + auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2); + Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1); + MI.eraseFromParent(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h index 1d4747136bf7..a933e85ce3ca 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h @@ -1,4 +1,4 @@ -//=== lib/CodeGen/GlobalISel/AMDGPUCombinerHelper.h -----------------------===// +//=== lib/CodeGen/GlobalISel/AMDGPUCombinerHelper.h -------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -12,6 +12,9 @@ /// //===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOMBINERHELPER_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOMBINERHELPER_H + #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" @@ -23,4 +26,11 @@ public: bool matchFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo); void applyFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo); + + bool matchExpandPromotedF16FMed3(MachineInstr &MI, Register Src0, + Register Src1, Register Src2); + void applyExpandPromotedF16FMed3(MachineInstr &MI, Register Src0, + Register Src1, Register Src2); }; + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOMBINERHELPER_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp index ba5a8799792a..a13447586bd4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp @@ -31,15 +31,14 @@ static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) { StringRef InitOrFiniKernelName = "amdgcn.device.init"; if (!IsCtor) InitOrFiniKernelName = "amdgcn.device.fini"; + if (M.getFunction(InitOrFiniKernelName)) + return nullptr; Function *InitOrFiniKernel = Function::createWithDefaultAttr( FunctionType::get(Type::getVoidTy(M.getContext()), false), - GlobalValue::ExternalLinkage, 0, InitOrFiniKernelName, &M); - BasicBlock *InitOrFiniKernelBB = - BasicBlock::Create(M.getContext(), "", InitOrFiniKernel); - ReturnInst::Create(M.getContext(), InitOrFiniKernelBB); - + GlobalValue::WeakODRLinkage, 0, InitOrFiniKernelName, &M); InitOrFiniKernel->setCallingConv(CallingConv::AMDGPU_KERNEL); + InitOrFiniKernel->addFnAttr("amdgpu-flat-work-group-size", "1,1"); if (IsCtor) InitOrFiniKernel->addFnAttr("device-init"); else @@ -47,6 +46,71 @@ static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) { return InitOrFiniKernel; } +// The 
linker will provide the associated symbols to allow us to traverse the +// global constructors / destructors in priority order. We create the IR +// required to call each callback in this section. This is equivalent to the +// following code. +// +// extern "C" void * __init_array_start[]; +// extern "C" void * __init_array_end[]; +// +// using InitCallback = void(); +// +// void call_init_array_callbacks() { +// for (auto start = __init_array_start; start != __init_array_end; ++start) +// reinterpret_cast<InitCallback *>(*start)(); +// } +static void createInitOrFiniCalls(Function &F, bool IsCtor) { + Module &M = *F.getParent(); + LLVMContext &C = M.getContext(); + + IRBuilder<> IRB(BasicBlock::Create(C, "entry", &F)); + auto *LoopBB = BasicBlock::Create(C, "while.entry", &F); + auto *ExitBB = BasicBlock::Create(C, "while.end", &F); + Type *PtrTy = IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS); + + auto *Begin = M.getOrInsertGlobal( + IsCtor ? "__init_array_start" : "__fini_array_start", + ArrayType::get(PtrTy, 0), [&]() { + return new GlobalVariable( + M, ArrayType::get(PtrTy, 0), + /*isConstant=*/true, GlobalValue::ExternalLinkage, + /*Initializer=*/nullptr, + IsCtor ? "__init_array_start" : "__fini_array_start", + /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal, + /*AddressSpace=*/1); + }); + auto *End = M.getOrInsertGlobal( + IsCtor ? "__init_array_end" : "__fini_array_end", + ArrayType::get(PtrTy, 0), [&]() { + return new GlobalVariable( + M, ArrayType::get(PtrTy, 0), + /*isConstant=*/true, GlobalValue::ExternalLinkage, + /*Initializer=*/nullptr, + IsCtor ? "__init_array_end" : "__fini_array_end", + /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal, + /*AddressSpace=*/1); + }); + + // The constructor type is supposed to allow using the argument vectors, but + // for now we just call them with no arguments. 
+ auto *CallBackTy = FunctionType::get(IRB.getVoidTy(), {}); + + IRB.CreateCondBr(IRB.CreateICmpNE(Begin, End), LoopBB, ExitBB); + IRB.SetInsertPoint(LoopBB); + auto *CallBackPHI = IRB.CreatePHI(PtrTy, 2, "ptr"); + auto *CallBack = IRB.CreateLoad(CallBackTy->getPointerTo(F.getAddressSpace()), + CallBackPHI, "callback"); + IRB.CreateCall(CallBackTy, CallBack); + auto *NewCallBack = IRB.CreateConstGEP1_64(PtrTy, CallBackPHI, 1, "next"); + auto *EndCmp = IRB.CreateICmpEQ(NewCallBack, End, "end"); + CallBackPHI->addIncoming(Begin, &F.getEntryBlock()); + CallBackPHI->addIncoming(NewCallBack, LoopBB); + IRB.CreateCondBr(EndCmp, ExitBB, LoopBB); + IRB.SetInsertPoint(ExitBB); + IRB.CreateRetVoid(); +} + static bool createInitOrFiniKernel(Module &M, StringRef GlobalName, bool IsCtor) { GlobalVariable *GV = M.getGlobalVariable(GlobalName); @@ -57,18 +121,12 @@ static bool createInitOrFiniKernel(Module &M, StringRef GlobalName, return false; Function *InitOrFiniKernel = createInitOrFiniKernelFunction(M, IsCtor); - IRBuilder<> IRB(InitOrFiniKernel->getEntryBlock().getTerminator()); - - FunctionType *ConstructorTy = InitOrFiniKernel->getFunctionType(); + if (!InitOrFiniKernel) + return false; - for (Value *V : GA->operands()) { - auto *CS = cast<ConstantStruct>(V); - IRB.CreateCall(ConstructorTy, CS->getOperand(1)); - } + createInitOrFiniCalls(*InitOrFiniKernel, IsCtor); appendToUsed(M, {InitOrFiniKernel}); - - GV->eraseFromParent(); return true; } @@ -83,17 +141,15 @@ class AMDGPUCtorDtorLoweringLegacy final : public ModulePass { public: static char ID; AMDGPUCtorDtorLoweringLegacy() : ModulePass(ID) {} - bool runOnModule(Module &M) override { - return lowerCtorsAndDtors(M); - } + bool runOnModule(Module &M) override { return lowerCtorsAndDtors(M); } }; } // End anonymous namespace PreservedAnalyses AMDGPUCtorDtorLoweringPass::run(Module &M, ModuleAnalysisManager &AM) { - lowerCtorsAndDtors(M); - return PreservedAnalyses::all(); + return lowerCtorsAndDtors(M) ? 
PreservedAnalyses::none() + : PreservedAnalyses::all(); } char AMDGPUCtorDtorLoweringLegacy::ID = 0; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 7e7dbacaac11..37df4f68c265 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -31,6 +31,10 @@ def gi_vop3mods : GIComplexOperandMatcher<s32, "selectVOP3Mods">, GIComplexPatternEquiv<VOP3Mods>; +def gi_vop3modsnoncanonicalizing : + GIComplexOperandMatcher<s32, "selectVOP3ModsNonCanonicalizing">, + GIComplexPatternEquiv<VOP3ModsNonCanonicalizing>; + def gi_vop3_no_mods : GIComplexOperandMatcher<s32, "selectVOP3NoMods">, GIComplexPatternEquiv<VOP3NoMods>; @@ -153,6 +157,10 @@ def gi_vop3_mad_mix_mods : GIComplexOperandMatcher<s64, "selectVOP3PMadMixMods">, GIComplexPatternEquiv<VOP3PMadMixMods>; +def gi_vop3_mad_mix_mods_ext : + GIComplexOperandMatcher<s64, "selectVOP3PMadMixModsExt">, + GIComplexPatternEquiv<VOP3PMadMixModsExt>; + // Separate load nodes are defined to glue m0 initialization in // SelectionDAG. 
The GISel selector can just insert m0 initialization // directly before selecting a glue-less load, so hide this @@ -227,10 +235,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT, SItbuffer_store>; def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>; // FIXME: Check MMO is atomic -def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, SIatomic_inc>; -def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, SIatomic_dec>; -def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, atomic_inc_glue>; -def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, atomic_dec_glue>; +def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>; +def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>; def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, SIatomic_fmin>; def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, SIatomic_fmax>; def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, atomic_load_fmin_glue>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp index 2ffc8b2a3a7b..09930dc9612c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp @@ -10,8 +10,8 @@ #include "GCNSubtarget.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/CodeGen/LowLevelType.h" #include "llvm/IR/Constants.h" -#include "llvm/Support/LowLevelTypeImpl.h" using namespace llvm; using namespace MIPatternMatch; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index a71ba6b77565..dadc0c92ef8b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -418,9 +418,7 @@ void MetadataStreamerYamlV2::emitHiddenKernelArgs(const Function &Func, } if (HiddenArgNumBytes >= 48) { - if (!Func.hasFnAttribute("amdgpu-no-completion-action") && - // FIXME: Hack for runtime bug if we fail to optimize this out - 
Func.hasFnAttribute("calls-enqueue-kernel")) { + if (!Func.hasFnAttribute("amdgpu-no-completion-action")) { emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenCompletionAction); } else { emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone); @@ -854,9 +852,7 @@ void MetadataStreamerMsgPackV3::emitHiddenKernelArgs( } if (HiddenArgNumBytes >= 48) { - if (!Func.hasFnAttribute("amdgpu-no-completion-action") && - // FIXME: Hack for runtime bug if we fail to optimize this out - Func.hasFnAttribute("calls-enqueue-kernel")) { + if (!Func.hasFnAttribute("amdgpu-no-completion-action")) { emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset, Args); } else { @@ -876,7 +872,8 @@ void MetadataStreamerMsgPackV3::emitHiddenKernelArgs( } msgpack::MapDocNode MetadataStreamerMsgPackV3::getHSAKernelProps( - const MachineFunction &MF, const SIProgramInfo &ProgramInfo) const { + const MachineFunction &MF, const SIProgramInfo &ProgramInfo, + unsigned CodeObjectVersion) const { const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); const Function &F = MF.getFunction(); @@ -890,10 +887,11 @@ msgpack::MapDocNode MetadataStreamerMsgPackV3::getHSAKernelProps( Kern.getDocument()->getNode(ProgramInfo.LDSSize); Kern[".private_segment_fixed_size"] = Kern.getDocument()->getNode(ProgramInfo.ScratchSize); - if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) + if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5) Kern[".uses_dynamic_stack"] = Kern.getDocument()->getNode(ProgramInfo.DynamicCallStack); - if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5 && STM.supportsWGP()) + + if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5 && STM.supportsWGP()) Kern[".workgroup_processor_mode"] = Kern.getDocument()->getNode(ProgramInfo.WgpMode); @@ -945,10 +943,12 @@ void MetadataStreamerMsgPackV3::end() { void MetadataStreamerMsgPackV3::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) { auto &Func = 
MF.getFunction(); - auto Kern = getHSAKernelProps(MF, ProgramInfo); + if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL && + Func.getCallingConv() != CallingConv::SPIR_KERNEL) + return; - assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL || - Func.getCallingConv() == CallingConv::SPIR_KERNEL); + auto CodeObjectVersion = AMDGPU::getCodeObjectVersion(*Func.getParent()); + auto Kern = getHSAKernelProps(MF, ProgramInfo, CodeObjectVersion); auto Kernels = getRootMetadata("amdhsa.kernels").getArray(/*Convert=*/true); @@ -1079,9 +1079,7 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs( Offset += 8; // Skipped. } - if (!Func.hasFnAttribute("amdgpu-no-completion-action") && - // FIXME: Hack for runtime bug - Func.hasFnAttribute("calls-enqueue-kernel")) { + if (!Func.hasFnAttribute("amdgpu-no-completion-action")) { emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset, Args); } else { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 91670b9820a2..7d7080e920f5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -84,7 +84,8 @@ protected: msgpack::ArrayDocNode getWorkGroupDimensions(MDNode *Node) const; msgpack::MapDocNode getHSAKernelProps(const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const; + const SIProgramInfo &ProgramInfo, + unsigned CodeObjectVersion) const; void emitVersion() override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index fc0df61952e4..ffa6c88f9d41 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -80,6 +80,37 @@ enum class SchedGroupMask { LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) }; +class SchedGroup; + +// InstructionRule class is used to enact a filter which determines whether or +// not an SU maps to a given SchedGroup. 
It contains complementary data +// structures (e.g Cache) to help those filters. +class InstructionRule { +protected: + const SIInstrInfo *TII; + unsigned SGID; + // A cache made available to the Filter to store SUnits for subsequent + // invocations of the Filter + std::optional<SmallVector<SUnit *, 4>> Cache; + +public: + virtual bool + apply(const SUnit *, const ArrayRef<SUnit *>, + SmallVectorImpl<SchedGroup> &) { + return true; + }; + + InstructionRule(const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : TII(TII), SGID(SGID) { + if (NeedsCache) { + Cache = SmallVector<SUnit *, 4>(); + } + } + + virtual ~InstructionRule() = default; +}; + typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap; // Classify instructions into groups to enable fine tuned control over the @@ -102,11 +133,12 @@ private: // SGID is used to map instructions to candidate SchedGroups unsigned SGID; + // The different rules each instruction in this SchedGroup must conform to + SmallVector<std::shared_ptr<InstructionRule>, 4> Rules; + // Count of the number of created SchedGroups, used to initialize SGID. static unsigned NumSchedGroups; - ScheduleDAGInstrs *DAG; - const SIInstrInfo *TII; // Try to add and edge from SU A to SU B. @@ -120,6 +152,8 @@ public: // Collection of SUnits that are classified as members of this group. SmallVector<SUnit *, 32> Collection; + ScheduleDAGInstrs *DAG; + // Returns true if SU can be added to this SchedGroup. bool canAddSU(SUnit &SU) const; @@ -145,6 +179,28 @@ public: // Returns true if no more instructions may be added to this group. bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; } + // Append a constraint that SUs must meet in order to fit into this + // SchedGroup. Since many rules involve the relationship between a SchedGroup + // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve + // time (rather than SchedGroup init time.) 
+ void addRule(std::shared_ptr<InstructionRule> NewRule) { + Rules.push_back(NewRule); + } + + // Returns true if the SU matches all rules + bool allowedByRules(const SUnit *SU, + SmallVectorImpl<SchedGroup> &SyncPipe) const { + if (Rules.empty()) + return true; + for (size_t I = 0; I < Rules.size(); I++) { + auto TheRule = Rules[I].get(); + if (!TheRule->apply(SU, Collection, SyncPipe)) { + return false; + } + } + return true; + } + // Add SU to the SchedGroup. void add(SUnit &SU) { LLVM_DEBUG(dbgs() << "For SchedGroup with mask " @@ -177,13 +233,13 @@ public: SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) - : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) { + : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) { SGID = NumSchedGroups++; } SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) - : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) { + : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) { SGID = NumSchedGroups++; } }; @@ -254,6 +310,9 @@ class PipelineSolver { // How many branches we have explored uint64_t BranchesExplored = 0; + // The direction in which we process the candidate SchedGroups per SU + bool IsBottomUp = 1; + // Update indices to fit next conflicting instruction void advancePosition(); // Recede indices to attempt to find better fit for previous conflicting @@ -264,19 +323,35 @@ class PipelineSolver { bool solveExact(); // The polynomial time algorithm which attempts to find a good fit bool solveGreedy(); + // Find the best SchedGroup for the current SU using the heuristic given all + // current information. One step in the greedy algorithm. Templated against + // the SchedGroup iterator (either reverse or forward). 
+ template <typename T> + void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, + T E); // Whether or not the current solution is optimal bool checkOptimal(); // Populate the ready list, prioritizing fewest missed edges first - void populateReadyList(SUToCandSGsPair &CurrSU, - SmallVectorImpl<std::pair<int, int>> &ReadyList, - SmallVectorImpl<SchedGroup> &SyncPipeline); + // Templated against the SchedGroup iterator (either reverse or forward). + template <typename T> + void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, + T E); // Add edges corresponding to the SchedGroups as assigned by solver void makePipeline(); + // Link the SchedGroups in the best found pipeline. + // Templated against the SchedGroup iterator (either reverse or forward). + template <typename T> void linkSchedGroups(T I, T E); // Add the edges from the SU to the other SchedGroups in pipeline, and // return the number of edges missed. int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); - // Remove the edges passed via AddedEdges + // Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. 
It + // returns the cost (in terms of missed pipeline edges), and tracks the edges + // added in \p AddedEdges + template <typename T> + int linkSUnit(SUnit *SU, int SGID, + std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E); + // Remove the edges passed via \p AddedEdges void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); // Convert the passed in maps to arrays for bidirectional iterators void convertSyncMapsToArrays(); @@ -290,9 +365,9 @@ public: PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, - ScheduleDAGMI *DAG) + ScheduleDAGMI *DAG, bool IsBottomUp = 1) : DAG(DAG), SyncedInstrs(SyncedInstrs), - SyncedSchedGroups(SyncedSchedGroups) { + SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) { for (auto &PipelineInstrs : SyncedInstrs) { if (PipelineInstrs.second.size() > 0) { @@ -363,14 +438,28 @@ void PipelineSolver::convertSyncMapsToArrays() { } } +template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) { + for (; I != E; ++I) { + auto &GroupA = *I; + for (auto J = std::next(I); J != E; ++J) { + auto &GroupB = *J; + GroupA.link(GroupB); + } + } +} + void PipelineSolver::makePipeline() { // Preserve the order of barrier for subsequent SchedGroupBarrier mutations for (auto &SyncPipeline : BestPipeline) { + LLVM_DEBUG(dbgs() << "Printing SchedGroups\n"); for (auto &SG : SyncPipeline) { + LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID() + << " has: \n"); SUnit *SGBarr = nullptr; for (auto &SU : SG.Collection) { if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) SGBarr = SU; + LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n"); } // Command line requested IGroupLP doesn't have SGBarr if (!SGBarr) @@ -381,43 +470,47 @@ void PipelineSolver::makePipeline() { } for (auto &SyncPipeline : BestPipeline) { - auto I = SyncPipeline.rbegin(); - auto E = SyncPipeline.rend(); - for (; I != E; ++I) { - auto &GroupA 
= *I; - for (auto J = std::next(I); J != E; ++J) { - auto &GroupB = *J; - GroupA.link(GroupB); - } - } + IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend()) + : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end()); } } -int PipelineSolver::addEdges( - SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, - std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { - int AddedCost = 0; +template <typename T> +int PipelineSolver::linkSUnit( + SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, + T I, T E) { bool MakePred = false; - - // The groups in the pipeline are in reverse order. Thus, - // by traversing them from last to first, we are traversing - // them in the order as they were introduced in the code. After we - // pass the group the SU is being assigned to, it should be - // linked as a predecessor of the subsequent SchedGroups - auto GroupNo = (int)SyncPipeline.size() - 1; - for (; GroupNo >= 0; GroupNo--) { - if (SyncPipeline[GroupNo].getSGID() == SGID) { + int AddedCost = 0; + for (; I < E; ++I) { + if (I->getSGID() == SGID) { MakePred = true; continue; } - auto Group = &SyncPipeline[GroupNo]; - AddedCost += Group->link(*SU, MakePred, AddedEdges); + auto Group = *I; + AddedCost += Group.link(*SU, MakePred, AddedEdges); assert(AddedCost >= 0); } - return AddedCost; } +int PipelineSolver::addEdges( + SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, + std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { + + // For IsBottomUp, the first SchedGroup in SyncPipeline contains the + // instructions that are the ultimate successors in the resultant mutation. + // Therefore, in such a configuration, the SchedGroups occurring before the + // candidate SGID are successors of the candidate SchedGroup, thus the current + // SU should be linked as a predecessor to SUs in those SchedGroups. The + // opposite is true if !IsBottomUp. 
IsBottomUp occurs in the case of multiple + // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using + // IsBottomUp (in reverse). + return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(), + SyncPipeline.rend()) + : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(), + SyncPipeline.end()); +} + void PipelineSolver::removeEdges( const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) { // Only remove the edges that we have added when testing @@ -490,12 +583,13 @@ bool PipelineSolver::checkOptimal() { return (DoneExploring || BestCost == 0); } +template <typename T> void PipelineSolver::populateReadyList( - SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList, - SmallVectorImpl<SchedGroup> &SyncPipeline) { + SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) { + SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; + auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; assert(CurrSU.second.size() >= 1); - auto I = CurrSU.second.rbegin(); - auto E = CurrSU.second.rend(); + for (; I != E; ++I) { std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; int CandSGID = *I; @@ -545,7 +639,10 @@ bool PipelineSolver::solveExact() { // SchedGroup -> Cost pairs SmallVector<std::pair<int, int>, 4> ReadyList; // Prioritize the candidate sched groups in terms of lowest cost first - populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]); + IsBottomUp ? 
populateReadyList(ReadyList, CurrSU.second.rbegin(), + CurrSU.second.rend()) + : populateReadyList(ReadyList, CurrSU.second.begin(), + CurrSU.second.end()); auto I = ReadyList.begin(); auto E = ReadyList.end(); @@ -569,6 +666,9 @@ bool PipelineSolver::solveExact() { if (Match->isFull()) continue; + if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) + continue; + LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask " << (int)Match->getMask() << "and ID " << CandSGID << "\n"); @@ -620,64 +720,75 @@ bool PipelineSolver::solveExact() { return FinishedExploring; } -bool PipelineSolver::solveGreedy() { - BestCost = 0; - std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; - - while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) { - SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; - int BestNodeCost = -1; - int TempCost; - SchedGroup *BestGroup = nullptr; - int BestGroupID = -1; - auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; - LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum - << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); +template <typename T> +void PipelineSolver::greedyFind( + std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) { + SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; + int BestNodeCost = -1; + int TempCost; + SchedGroup *BestGroup = nullptr; + int BestGroupID = -1; + auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; + LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum + << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); - // Since we have added the potential SchedGroups from bottom up, but - // traversed the DAG from top down, parse over the groups from last to - // first. If we fail to do this for the greedy algorithm, the solution will - // likely not be good in more complex cases. 
- auto I = CurrSU.second.rbegin(); - auto E = CurrSU.second.rend(); - for (; I != E; ++I) { - std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; - int CandSGID = *I; - SchedGroup *Match; - for (auto &SG : SyncPipeline) { - if (SG.getSGID() == CandSGID) - Match = &SG; - } + // Since we have added the potential SchedGroups from bottom up, but + // traversed the DAG from top down, parse over the groups from last to + // first. If we fail to do this for the greedy algorithm, the solution will + // likely not be good in more complex cases. + for (; I != E; ++I) { + std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; + int CandSGID = *I; + SchedGroup *Match; + for (auto &SG : SyncPipeline) { + if (SG.getSGID() == CandSGID) + Match = &SG; + } - LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask " - << (int)Match->getMask() << "\n"); + LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask " + << (int)Match->getMask() << "\n"); - if (Match->isFull()) { - LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n"); - continue; - } - TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); - LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n"); - if (TempCost < BestNodeCost || BestNodeCost == -1) { - BestGroup = Match; - BestNodeCost = TempCost; - BestGroupID = CandSGID; - } - removeEdges(AddedEdges); - if (BestNodeCost == 0) - break; + if (Match->isFull()) { + LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n"); + continue; + } + if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) { + LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n"); + continue; } + TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); + LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n"); + if (TempCost < BestNodeCost || BestNodeCost == -1) { + BestGroup = Match; + BestNodeCost = TempCost; + BestGroupID = CandSGID; + } + removeEdges(AddedEdges); + if (BestNodeCost == 0) + break; + } - if (BestGroupID 
!= -1) { - BestGroup->add(*CurrSU.first); - addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges); - LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask" - << (int)BestGroup->getMask() << "\n"); - BestCost += TempCost; - } else - BestCost += MissPenalty; + if (BestGroupID != -1) { + BestGroup->add(*CurrSU.first); + addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges); + LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask" + << (int)BestGroup->getMask() << "\n"); + BestCost += TempCost; + } else + BestCost += MissPenalty; - CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; + CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; +} + +bool PipelineSolver::solveGreedy() { + BestCost = 0; + std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; + + while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) { + SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; + IsBottomUp + ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend()) + : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end()); advancePosition(); } BestPipeline = CurrPipeline; @@ -721,9 +832,14 @@ void PipelineSolver::solve() { } makePipeline(); + LLVM_DEBUG(dbgs() << "After applying mutation\n"); + LLVM_DEBUG(DAG->dump()); } -enum IGLPStrategyID : int { MFMASmallGemmOptID = 0 }; +enum IGLPStrategyID : int { + MFMASmallGemmOptID = 0, + MFMASmallGemmSingleWaveOptID = 1, +}; // Implement a IGLP scheduling strategy. class IGLPStrategy { @@ -741,6 +857,8 @@ public: // Returns true if this strategy should be applied to a ScheduleDAG. 
virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0; + bool IsBottomUp = 1; + IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) : DAG(DAG), TII(TII) {} @@ -748,6 +866,7 @@ public: }; class MFMASmallGemmOpt final : public IGLPStrategy { +private: public: void applyIGLPStrategy( DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, @@ -756,7 +875,9 @@ public: bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) - : IGLPStrategy(DAG, TII) {} + : IGLPStrategy(DAG, TII) { + IsBottomUp = 1; + } }; void MFMASmallGemmOpt::applyIGLPStrategy( @@ -781,12 +902,456 @@ void MFMASmallGemmOpt::applyIGLPStrategy( } } +class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { +private: + // Whether the DS_READ is a predecessor of first four MFMA in region + class EnablesInitialMFMA final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, + SmallVectorImpl<SchedGroup> &SyncPipe) override { + if (!SyncPipe.size()) + return false; + int MFMAsFound = 0; + if (!Cache->size()) { + for (auto &Elt : SyncPipe[0].DAG->SUnits) { + if (TII->isMFMAorWMMA(*Elt.getInstr())) { + ++MFMAsFound; + if (MFMAsFound > 4) + break; + Cache->push_back(&Elt); + } + } + } + + assert(Cache->size()); + auto DAG = SyncPipe[0].DAG; + for (auto &Elt : *Cache) { + if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU))) + return true; + } + return false; + } + + EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE + class IsPermForDSW final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, + SmallVectorImpl<SchedGroup> &SyncPipe) override { + auto MI = SU->getInstr(); + if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64) + return false; + + bool FitsInGroup 
= false; + // Does the VALU have a DS_WRITE successor + if (!Collection.size()) { + for (auto &Succ : SU->Succs) { + SUnit *SuccUnit = Succ.getSUnit(); + if (TII->isDS(*SuccUnit->getInstr()) && + SuccUnit->getInstr()->mayStore()) { + Cache->push_back(SuccUnit); + FitsInGroup = true; + } + } + return FitsInGroup; + } + + assert(Cache->size()); + + // Does the VALU have a DS_WRITE successor that is the same as other + // VALU already in the group. The V_PERMs will all share 1 DS_W succ + return std::any_of(Cache->begin(), Cache->end(), [&SU](SUnit *Elt) { + return std::any_of(SU->Succs.begin(), SU->Succs.end(), + [&Elt](const SDep &ThisSucc) { + return ThisSucc.getSUnit() == Elt; + }); + }); + } + + IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + // Whether the SU is a successor of any element in previous SchedGroup + class IsSuccOfPrevGroup final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, + SmallVectorImpl<SchedGroup> &SyncPipe) override { + SchedGroup *OtherGroup = nullptr; + for (auto &PipeSG : SyncPipe) { + if ((unsigned)PipeSG.getSGID() == SGID - 1) { + OtherGroup = &PipeSG; + } + } + + if (!OtherGroup) + return false; + if (!OtherGroup->Collection.size()) + return true; + + // Does the previous VALU have this DS_Write as a successor + return (std::any_of(OtherGroup->Collection.begin(), + OtherGroup->Collection.end(), [&SU](SUnit *Elt) { + return std::any_of(Elt->Succs.begin(), + Elt->Succs.end(), + [&SU](SDep &Succ) { + return Succ.getSUnit() == SU; + }); + })); + } + IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID, + bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + // Whether the combined load width of group is 128 bits + class VMEMSize final : public InstructionRule { + public: + bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, + SmallVectorImpl<SchedGroup> 
&SyncPipe) override { + auto MI = SU->getInstr(); + if (MI->getOpcode() == TargetOpcode::BUNDLE) + return false; + if (!Collection.size()) + return true; + + int NumBits = 0; + + auto TRI = TII->getRegisterInfo(); + auto &MRI = MI->getParent()->getParent()->getRegInfo(); + for (auto &Elt : Collection) { + auto Op = Elt->getInstr()->getOperand(0); + auto Size = + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op)); + NumBits += Size; + } + + if (NumBits < 128) { + assert(TII->isVMEM(*MI) && MI->mayLoad()); + if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg( + MRI, MI->getOperand(0))) <= + 128) + return true; + } + + return false; + } + + VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache) {} + }; + + // Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup + // that is /p Distance steps away + class SharesPredWithPrevNthGroup final : public InstructionRule { + private: + unsigned Distance = 1; + + public: + bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, + SmallVectorImpl<SchedGroup> &SyncPipe) override { + SchedGroup *OtherGroup = nullptr; + if (!SyncPipe.size()) + return false; + + if (!Cache->size()) { + + for (auto &PipeSG : SyncPipe) { + if ((unsigned)PipeSG.getSGID() == SGID - Distance) { + OtherGroup = &PipeSG; + } + } + + if (!OtherGroup) + return false; + if (!OtherGroup->Collection.size()) + return true; + + for (auto &OtherEle : OtherGroup->Collection) { + for (auto &Pred : OtherEle->Preds) { + if (Pred.getSUnit()->getInstr()->getOpcode() == + AMDGPU::V_PERM_B32_e64) + Cache->push_back(Pred.getSUnit()); + } + } + } + + assert(Cache->size()); + auto DAG = SyncPipe[0].DAG; + // Does the previous DS_WRITE share a V_PERM predecessor with this + // VMEM_READ + return ( + std::any_of(Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *Elt) { + return DAG->IsReachable(const_cast<SUnit *>(SU), Elt); + })); + } + 
SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, + unsigned SGID, bool NeedsCache = false) + : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} + }; + +public: + void applyIGLPStrategy( + DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, + DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override; + + bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } + + MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) + : IGLPStrategy(DAG, TII) { + IsBottomUp = 0; + } +}; + +void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( + DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, + DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) { + unsigned MFMACount = 0; + unsigned DSWCount = 0; + unsigned DSWWithPermCount = 0; + unsigned DSWWithSharedVMEMCount = 0; + unsigned DSRCount = 0; + SmallVector<SUnit *, 6> DSWithPerms; + for (auto &SU : DAG->SUnits) { + auto I = SU.getInstr(); + if (TII->isMFMAorWMMA(*I)) + ++MFMACount; + else if (TII->isDS(*I)) { + if (I->mayLoad()) + ++DSRCount; + else if (I->mayStore()) { + ++DSWCount; + for (auto Pred : SU.Preds) { + if (Pred.getSUnit()->getInstr()->getOpcode() == + AMDGPU::V_PERM_B32_e64) { + DSWithPerms.push_back(&SU); + break; + } + } + } + } + } + DSWWithPermCount = DSWithPerms.size(); + auto I = DSWithPerms.begin(); + auto E = DSWithPerms.end(); + + // Get the count of DS_WRITES with V_PERM predecessors which + // have loop carried dependencies (WAR) on the same VMEM_READs. + // We consider partial overlap as a miss -- in other words, + // for a given DS_W, we only consider another DS_W as matching + // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred + // for every V_PERM pred of this DS_W. 
+ DenseMap<MachineInstr *, SUnit *> VMEMLookup; + SmallVector<SUnit *, 6> Counted; + for (; I != E; I++) { + SUnit *Cand = nullptr; + bool MissedAny = false; + for (auto &Pred : (*I)->Preds) { + if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) + continue; + + if (Cand && + std::find(Counted.begin(), Counted.end(), Cand) != Counted.end()) + break; + + for (auto &Succ : Pred.getSUnit()->Succs) { + auto MI = Succ.getSUnit()->getInstr(); + if (!TII->isVMEM(*MI) || !MI->mayLoad()) + continue; + + if (MissedAny || !VMEMLookup.size()) { + MissedAny = true; + VMEMLookup[MI] = *I; + continue; + } + + if (!VMEMLookup.contains(MI)) { + MissedAny = true; + VMEMLookup[MI] = *I; + continue; + } + + Cand = VMEMLookup[MI]; + if (std::find(Counted.begin(), Counted.end(), Cand) != Counted.end()) { + MissedAny = true; + break; + } + } + } + if (!MissedAny && Cand) { + DSWWithSharedVMEMCount += 2; + Counted.push_back(Cand); + Counted.push_back(*I); + } + } + + assert(DSWWithSharedVMEMCount <= DSWWithPermCount); + SchedGroup *SG; + unsigned PipelineSyncID = 0; + // For kernels with V_PERM, there are enough VALU to mix in between MFMAs + if (DSWWithPermCount) { + for (unsigned I = 0; I < MFMACount; I++) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + } + + PipelineSyncID = 1; + // Phase 1: Break up DS_READ and MFMA clusters. 
+ // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ + // prefetch + + // Make ready initial MFMA + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + // Interleave MFMA with DS_READ prefetch + for (unsigned I = 0; I < DSRCount - 4; ++I) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // Phase 2a: Loop carried dependency with V_PERM + // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they + // depend on. Interleave MFMA to keep XDL unit busy throughout. 
+ for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( + 1, TII, SG->getSGID(), true)); + SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( + 3, TII, SG->getSGID(), true)); + SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // Phase 2b: Loop carried dependency without V_PERM + // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on. + // Interleave MFMA to keep XDL unit busy throughout. 
+ for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } + + // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are + // ultimately used by two DS_WRITE + // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they + // depend on. Interleave MFMA to keep XDL unit busy throughout. + + for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) { + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); + 
SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( + 2, TII, SG->getSGID(), true)); + SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); + SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( + 4, TII, SG->getSGID(), true)); + SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false)); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + + SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( + SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); + SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); + } +} + static std::unique_ptr<IGLPStrategy> createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) { switch (ID) { case MFMASmallGemmOptID: return std::make_unique<MFMASmallGemmOpt>(DAG, TII); + case MFMASmallGemmSingleWaveOptID: + return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII); } llvm_unreachable("Unknown IGLPStrategyID"); @@ -829,6 +1394,13 @@ private: public: void apply(ScheduleDAGInstrs *DAGInstrs) override; + // The order in which the PipelineSolver should process the candidate + // SchedGroup for a PipelineInstr. 
BOTTOM_UP will try to add SUs to the last + // created SchedGroup first, and will consider that as the ultimate + // predecessor group when linking. TOP_DOWN instead links and processes the + // first created SchedGroup first. + bool IsBottomUp = 1; + IGroupLPDAGMutation() = default; }; @@ -908,6 +1480,7 @@ int SchedGroup::link(SUnit &SU, bool MakePred, if (DAG->IsReachable(B, A)) continue; + // tryAddEdge returns false if there is a dependency that makes adding // the A->B edge impossible, otherwise it returns true; bool Added = tryAddEdge(A, B); @@ -1034,7 +1607,7 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { } if (foundSB || foundIGLP) { - PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG); + PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp); // PipelineSolver performs the mutation by adding the edges it // determined as the best PS.solve(); @@ -1114,8 +1687,10 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { IGLPStrategyID StrategyID = (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); auto S = createIGLPStrategy(StrategyID, DAG, TII); - if (S->shouldApplyStrategy(DAG)) + if (S->shouldApplyStrategy(DAG)) { + IsBottomUp = S->IsBottomUp; S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups); + } } } // namespace diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 42d1f58e4239..825c6f0acd0f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -20,7 +20,7 @@ #include "MCTargetDesc/R600MCTargetDesc.h" #include "R600RegisterInfo.h" #include "SIMachineFunctionInfo.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" +#include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -28,6 +28,7 @@ #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include 
"llvm/InitializePasses.h" +#include "llvm/Support/ErrorHandling.h" #ifdef EXPENSIVE_CHECKS #include "llvm/Analysis/LoopInfo.h" @@ -101,7 +102,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel", "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo) INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis) -INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) #ifdef EXPENSIVE_CHECKS INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) @@ -131,7 +132,7 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { } #endif Subtarget = &MF.getSubtarget<GCNSubtarget>(); - Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction()); + Mode = SIModeRegisterDefaults(MF.getFunction()); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -167,6 +168,7 @@ bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const { case ISD::FFLOOR: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FLDEXP: case AMDGPUISD::FRACT: case AMDGPUISD::CLAMP: case AMDGPUISD::COS_HW: @@ -178,7 +180,6 @@ bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const { case AMDGPUISD::RCP: case AMDGPUISD::RSQ: case AMDGPUISD::RCP_IFLAG: - case AMDGPUISD::LDEXP: // On gfx10, all 16-bit instructions preserve the high bits. 
return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9; case ISD::FP_ROUND: @@ -199,7 +200,7 @@ bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const { void AMDGPUDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<AMDGPUArgumentUsageInfo>(); - AU.addRequired<LegacyDivergenceAnalysis>(); + AU.addRequired<UniformityInfoWrapperPass>(); #ifdef EXPENSIVE_CHECKS AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); @@ -503,10 +504,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { // isa<MemSDNode> almost works but is slightly too permissive for some DS // intrinsics. if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) || - (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC || - Opc == ISD::ATOMIC_LOAD_FADD || - Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || - Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) { + Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || + Opc == AMDGPUISD::ATOMIC_LOAD_FMAX) { N = glueCopyToM0LDSInit(N); SelectCode(N); return; @@ -528,8 +527,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectADD_SUB_I64(N); return; } - case ISD::ADDCARRY: - case ISD::SUBCARRY: + case ISD::UADDO_CARRY: + case ISD::USUBO_CARRY: if (N->getValueType(0) != MVT::i32) break; @@ -665,10 +664,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::BRCOND: SelectBRCOND(N); return; - case ISD::FMAD: - case ISD::FMA: - SelectFMAD_FMA(N); - return; case AMDGPUISD::CVT_PKRTZ_F16_F32: case AMDGPUISD::CVT_PKNORM_I16_F32: case AMDGPUISD::CVT_PKNORM_U16_F32: @@ -714,11 +709,11 @@ bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N, assert(N->getOpcode() == ISD::AND); const APInt &RHS = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue(); - if (RHS.countTrailingOnes() >= ShAmtBits) + if (RHS.countr_one() >= ShAmtBits) return true; const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero; - return (LHSKnownZeros | RHS).countTrailingOnes() >= ShAmtBits; + return (LHSKnownZeros | 
RHS).countr_one() >= ShAmtBits; } static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, @@ -813,7 +808,7 @@ SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val, return SDValue(Mov, 0); } -// FIXME: Should only handle addcarry/subcarry +// FIXME: Should only handle uaddo_carry/usubo_carry void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDLoc DL(N); SDValue LHS = N->getOperand(0); @@ -890,15 +885,15 @@ void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) { SDValue CI = N->getOperand(2); if (N->isDivergent()) { - unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64 - : AMDGPU::V_SUBB_U32_e64; + unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64 + : AMDGPU::V_SUBB_U32_e64; CurDAG->SelectNodeTo( N, Opc, N->getVTList(), {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/}); } else { - unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO - : AMDGPU::S_SUB_CO_PSEUDO; + unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? 
AMDGPU::S_ADD_CO_PSEUDO + : AMDGPU::S_SUB_CO_PSEUDO; CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI}); } } @@ -913,8 +908,8 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E; ++UI) if (UI.getUse().getResNo() == 1) { - if ((IsAdd && (UI->getOpcode() != ISD::ADDCARRY)) || - (!IsAdd && (UI->getOpcode() != ISD::SUBCARRY))) { + if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) || + (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) { IsVALU = true; break; } @@ -1141,6 +1136,15 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0, return CurDAG->SignBitIsZero(Base); } +bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base, + uint64_t FlatVariant) const { + if (FlatVariant != SIInstrFlags::FlatScratch) + return true; + // When value in 32-bit Base can be negative calculate scratch offset using + // 32-bit add instruction, otherwise use Base(unsigned) + offset. + return CurDAG->SignBitIsZero(Base); +} + // TODO: If offset is too big, put low 16-bit into offset. bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, SDValue &Offset0, @@ -1283,7 +1287,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr, Ptr = N2; VAddr = N3; } - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); } else if (N0->isDivergent()) { // N0 is divergent. Use it as the addr64, and construct the resource from a // 0 address. @@ -1299,18 +1303,18 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr, if (!C1) { // No offset. - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); return true; } if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) { // Legal offset for instruction. 
- Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32); return true; } // Illegal offset, store it in soffset. - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); SOffset = SDValue(CurDAG->getMachineNode( AMDGPU::S_MOV_B32, DL, MVT::i32, @@ -1377,13 +1381,15 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS); // Don't fold null pointer. if (Imm != NullPtr) { - SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32); + const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(); + SDValue HighBits = + CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32); MachineSDNode *MovHighBits = CurDAG->getMachineNode( AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits); VAddr = SDValue(MovHighBits, 0); SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); - ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16); + ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32); return true; } } @@ -1414,14 +1420,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, (!Subtarget->privateMemoryResourceIsRangeChecked() || CurDAG->SignBitIsZero(N0))) { std::tie(VAddr, SOffset) = foldFrameIndex(N0); - ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32); return true; } } // (node) std::tie(VAddr, SOffset) = foldFrameIndex(Addr); - ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); + ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); return true; } @@ -1450,7 +1456,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, if (IsCopyFromSGPR(*TRI, Addr)) { SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); SOffset = Addr; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + 
Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); return true; } @@ -1474,7 +1480,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); - Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); + Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32); return true; } @@ -1532,7 +1538,8 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) { SDValue N0, N1; - if (isBaseWithConstantOffset64(Addr, N0, N1)) { + if (isBaseWithConstantOffset64(Addr, N0, N1) && + isFlatScratchBaseLegal(N0, FlatVariant)) { int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); @@ -1764,7 +1771,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, int64_t COffsetVal = 0; - if (CurDAG->isBaseWithConstantOffset(Addr)) { + if (CurDAG->isBaseWithConstantOffset(Addr) && + isFlatScratchBaseLegal(Addr.getOperand(0))) { COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue(); SAddr = Addr.getOperand(0); } else { @@ -1842,6 +1850,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); VAddr = SDValue(VMov, 0); SAddr = LHS; + if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr)) + return false; if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset)) return false; Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); @@ -1866,6 +1876,9 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr, return false; } + if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr)) + return false; + if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset)) return false; SAddr = SelectSAddrFI(CurDAG, SAddr); @@ -2283,52 +2296,6 @@ void 
AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { VCC.getValue(0)); } -void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) { - MVT VT = N->getSimpleValueType(0); - bool IsFMA = N->getOpcode() == ISD::FMA; - if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() && - !Subtarget->hasFmaMixInsts()) || - ((IsFMA && Subtarget->hasMadMixInsts()) || - (!IsFMA && Subtarget->hasFmaMixInsts()))) { - SelectCode(N); - return; - } - - SDValue Src0 = N->getOperand(0); - SDValue Src1 = N->getOperand(1); - SDValue Src2 = N->getOperand(2); - unsigned Src0Mods, Src1Mods, Src2Mods; - - // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand - // using the conversion from f16. - bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods); - bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods); - bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods); - - assert((IsFMA || !Mode.allFP32Denormals()) && - "fmad selected with denormals enabled"); - // TODO: We can select this with f32 denormals enabled if all the sources are - // converted from f16 (in which case fmad isn't legal). - - if (Sel0 || Sel1 || Sel2) { - // For dummy operands. - SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); - SDValue Ops[] = { - CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0, - CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1, - CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2, - CurDAG->getTargetConstant(0, SDLoc(), MVT::i1), - Zero, Zero - }; - - CurDAG->SelectNodeTo(N, - IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32, - MVT::f32, Ops); - } else { - SelectCode(N); - } -} - void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) { // The address is assumed to be uniform, so if it ends up in a VGPR, it will // be copied to an SGPR with readfirstlane. 
@@ -2562,6 +2529,18 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { case Intrinsic::amdgcn_interp_p1_f16: SelectInterpP1F16(N); return; + case Intrinsic::amdgcn_inverse_ballot: + switch (N->getOperand(1).getValueSizeInBits()) { + case 32: + Opcode = AMDGPU::S_INVERSE_BALLOT_U32; + break; + case 64: + Opcode = AMDGPU::S_INVERSE_BALLOT_U64; + break; + default: + llvm_unreachable("Unsupported size for inverse ballot mask."); + } + break; default: SelectCode(N); return; @@ -2591,13 +2570,22 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &Mods, + bool IsCanonicalizing, bool AllowAbs) const { - Mods = 0; + Mods = SISrcMods::NONE; Src = In; if (Src.getOpcode() == ISD::FNEG) { Mods |= SISrcMods::NEG; Src = Src.getOperand(0); + } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) { + // Fold fsub [+-]0 into fneg. This may not have folded depending on the + // denormal mode, but we're implicitly canonicalizing in a source operand. 
+ auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0)); + if (LHS && LHS->isZero()) { + Mods |= SISrcMods::NEG; + Src = Src.getOperand(1); + } } if (AllowAbs && Src.getOpcode() == ISD::FABS) { @@ -2611,7 +2599,20 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods; - if (SelectVOP3ModsImpl(In, Src, Mods)) { + if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true, + /*AllowAbs=*/true)) { + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; + } + + return false; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing( + SDValue In, SDValue &Src, SDValue &SrcMods) const { + unsigned Mods; + if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false, + /*AllowAbs=*/true)) { SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } @@ -2622,7 +2623,9 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods; - if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) { + if (SelectVOP3ModsImpl(In, Src, Mods, + /*IsCanonicalizing=*/true, + /*AllowAbs=*/false)) { SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } @@ -2642,7 +2645,9 @@ bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src, SDValue &SrcMods, bool OpSel) const { unsigned Mods; - if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) { + if (SelectVOP3ModsImpl(In, Src, Mods, + /*IsCanonicalizing=*/true, + /*AllowAbs=*/false)) { if (OpSel) Mods |= SISrcMods::OP_SEL_0; SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); @@ -2695,9 +2700,10 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src, bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods, bool IsDOT) const { - unsigned Mods = 
0; + unsigned Mods = SISrcMods::NONE; Src = In; + // TODO: Handle G_FSUB 0 as fneg if (Src.getOpcode() == ISD::FNEG) { Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); Src = Src.getOperand(0); @@ -2776,7 +2782,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF() .bitcastToAPInt().getZExtValue(); if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) { - Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);; + Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64); SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } @@ -2804,7 +2810,7 @@ bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const { assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); unsigned Mods = SISrcMods::OP_SEL_1; - unsigned SrcSign = C->getAPIntValue().getZExtValue(); + unsigned SrcSign = C->getZExtValue(); if (SrcSign == 1) Mods ^= SISrcMods::NEG; @@ -2818,7 +2824,7 @@ bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); unsigned Mods = SISrcMods::OP_SEL_1; - unsigned SrcVal = C->getAPIntValue().getZExtValue(); + unsigned SrcVal = C->getZExtValue(); if (SrcVal == 1) Mods |= SISrcMods::OP_SEL_0; @@ -2883,6 +2889,15 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, return false; } +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + if (!SelectVOP3PMadMixModsImpl(In, Src, Mods)) + return false; + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const { unsigned Mods = 0; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 162b0340a6aa..0605baf3a0cc 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -16,6 +16,7 @@ #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" +#include "SIModeRegisterDefaults.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Target/TargetMachine.h" @@ -24,11 +25,7 @@ using namespace llvm; namespace { static inline bool isNullConstantOrUndef(SDValue V) { - if (V.isUndef()) - return true; - - ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V); - return Const != nullptr && Const->isZero(); + return V.isUndef() || isNullConstant(V); } static inline bool getConstantValue(SDValue N, uint32_t &Out) { @@ -82,7 +79,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { const GCNSubtarget *Subtarget; // Default FP mode for the current function. - AMDGPU::SIModeRegisterDefaults Mode; + SIModeRegisterDefaults Mode; bool EnableLateStructurizeCFG; @@ -157,6 +154,9 @@ private: bool isDSOffsetLegal(SDValue Base, unsigned Offset) const; bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1, unsigned Size) const; + bool isFlatScratchBaseLegal( + SDValue Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const; + bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, SDValue &Offset1) const; @@ -216,8 +216,11 @@ private: bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods, + bool IsCanonicalizing = true, bool AllowAbs = true) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3ModsNonCanonicalizing(SDValue In, SDValue &Src, + SDValue &SrcMods) const; bool SelectVOP3BMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3NoMods(SDValue In, SDValue &Src) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, @@ -247,6 +250,8 @@ private: bool 
SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const; + bool SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, + SDValue &SrcMods) const; bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; SDValue getHi16Elt(SDValue In) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 8121b381e83f..254d02d4ce5b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -16,12 +16,13 @@ #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUMachineFunction.h" -#include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/KnownBits.h" #include "llvm/Target/TargetMachine.h" @@ -138,6 +139,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v16f64, Promote); AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32); + setOperationAction(ISD::LOAD, MVT::i128, Promote); + AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32); + // There are no 64-bit extloads. These should be done as a 32-bit extload and // an extension to 64-bit. 
for (MVT VT : MVT::integer_valuetypes()) @@ -264,6 +268,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v16f64, Promote); AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32); + setOperationAction(ISD::STORE, MVT::i128, Promote); + AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32); + setTruncStoreAction(MVT::i64, MVT::i1, Expand); setTruncStoreAction(MVT::i64, MVT::i8, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); @@ -321,14 +328,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // Library functions. These default to Expand, but we have instructions // for them. - setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FLOG2, ISD::FABS, - ISD::FFLOOR, ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM, - ISD::FMAXNUM}, + setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR, ISD::FRINT, + ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal); + setOperationAction(ISD::FLOG2, MVT::f32, Custom); setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom); - setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP}, MVT::f32, Custom); + setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2}, MVT::f32, + Custom); setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom); @@ -338,8 +346,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, if (Subtarget->has16BitInsts()) setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal); - else + else { setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal); + setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom); + } + + setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP}, MVT::f16, Custom); // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches // scalarization code. 
Can be removed when IS_FPCLASS expand isn't called by @@ -556,7 +568,7 @@ bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const { //===----------------------------------------------------------------------===// LLVM_READNONE -static bool fnegFoldsIntoOp(unsigned Opc) { +static bool fnegFoldsIntoOpcode(unsigned Opc) { switch (Opc) { case ISD::FADD: case ISD::FSUB: @@ -567,6 +579,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) { case ISD::FMAXNUM: case ISD::FMINNUM_IEEE: case ISD::FMAXNUM_IEEE: + case ISD::SELECT: case ISD::FSIN: case ISD::FTRUNC: case ISD::FRINT: @@ -582,17 +595,45 @@ static bool fnegFoldsIntoOp(unsigned Opc) { case AMDGPUISD::FMED3: // TODO: handle llvm.amdgcn.fma.legacy return true; + case ISD::BITCAST: + llvm_unreachable("bitcast is special cased"); default: return false; } } +static bool fnegFoldsIntoOp(const SDNode *N) { + unsigned Opc = N->getOpcode(); + if (Opc == ISD::BITCAST) { + // TODO: Is there a benefit to checking the conditions performFNegCombine + // does? We don't for the other cases. + SDValue BCSrc = N->getOperand(0); + if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) { + return BCSrc.getNumOperands() == 2 && + BCSrc.getOperand(1).getValueSizeInBits() == 32; + } + + return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32; + } + + return fnegFoldsIntoOpcode(Opc); +} + /// \p returns true if the operation will definitely need to use a 64-bit /// encoding, and thus will use a VOP3 encoding regardless of the source /// modifiers. LLVM_READONLY static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) { - return N->getNumOperands() > 2 || VT == MVT::f64; + return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) || + VT == MVT::f64; +} + +/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the +/// type for ISD::SELECT. 
+LLVM_READONLY +static bool selectSupportsSourceMods(const SDNode *N) { + // TODO: Only applies if select will be vector + return N->getValueType(0) == MVT::f32; } // Most FP instructions support source modifiers, but this could be refined @@ -604,7 +645,6 @@ static bool hasSourceMods(const SDNode *N) { switch (N->getOpcode()) { case ISD::CopyToReg: - case ISD::SELECT: case ISD::FDIV: case ISD::FREM: case ISD::INLINEASM: @@ -629,6 +669,8 @@ static bool hasSourceMods(const SDNode *N) { return true; } } + case ISD::SELECT: + return selectSupportsSourceMods(N); default: return true; } @@ -644,6 +686,8 @@ bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N, unsigned NumMayIncreaseSize = 0; MVT VT = N->getValueType(0).getScalarType().getSimpleVT(); + assert(!N->use_empty()); + // XXX - Should this limit number of uses to check? for (const SDNode *U : N->uses()) { if (!hasSourceMods(U)) @@ -800,6 +844,17 @@ SDValue AMDGPUTargetLowering::getNegatedExpression( return SDValue(); break; } + case AMDGPUISD::RCP: { + SDValue Src = Op.getOperand(0); + EVT VT = Op.getValueType(); + SDLoc SL(Op); + + SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations, + ForCodeSize, Cost, Depth + 1); + if (NegSrc) + return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags()); + return SDValue(); + } default: break; } @@ -827,7 +882,7 @@ bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16; } -bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, +bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const { return true; @@ -888,10 +943,6 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { return Src == MVT::i32 && Dest == MVT::i64; } -bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { - return isZExtFree(Val.getValueType(), VT2); -} - bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT 
DestVT) const { // There aren't really 64-bit registers, but pairs of 32-bit ones and only a // limited number of native 64-bit operations. Shrinking an operation to fit @@ -1021,7 +1072,7 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( const Function &Fn = MF.getFunction(); LLVMContext &Ctx = Fn.getParent()->getContext(); const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); - const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn); + const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(); CallingConv::ID CC = Fn.getCallingConv(); Align MaxAlign = Align(1); @@ -1258,12 +1309,15 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, return LowerFROUNDEVEN(Op, DAG); case ISD::FROUND: return LowerFROUND(Op, DAG); case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); + case ISD::FLOG2: + return LowerFLOG2(Op, DAG); case ISD::FLOG: - return LowerFLOG(Op, DAG, numbers::ln2f); case ISD::FLOG10: - return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f); + return LowerFLOGCommon(Op, DAG); case ISD::FEXP: return lowerFEXP(Op, DAG); + case ISD::FEXP2: + return lowerFEXP2(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); @@ -1292,6 +1346,23 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do // nothing here and let the illegal result integer be handled normally. 
return; + case ISD::FLOG2: + if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG)) + Results.push_back(Lowered); + return; + case ISD::FLOG: + case ISD::FLOG10: + if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG)) + Results.push_back(Lowered); + return; + case ISD::FEXP2: + if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG)) + Results.push_back(Lowered); + return; + case ISD::FEXP: + if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG)) + Results.push_back(Lowered); + return; default: return; } @@ -1305,6 +1376,13 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = G->getGlobal(); + if (!MFI->isModuleEntryFunction()) { + if (std::optional<uint32_t> Address = + AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) { + return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType()); + } + } + if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) { if (!MFI->isModuleEntryFunction() && @@ -1378,43 +1456,60 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { - + SDLoc SL(Op); SmallVector<SDValue, 8> Args; unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); EVT VT = Op.getValueType(); EVT SrcVT = Op.getOperand(0).getValueType(); - // For these types, we have some TableGen patterns except if the index is 1 - if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) || - (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) && - Start != 1) - return Op; + if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) { + unsigned NumElt = VT.getVectorNumElements(); + unsigned NumSrcElt = SrcVT.getVectorNumElements(); + assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types"); - if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) || - (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) && - (Start == 0 || Start == 4)) - 
return Op; + // Extract 32-bit registers at a time. + EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2); + EVT NewVT = NumElt == 2 + ? MVT::i32 + : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2); + SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0)); - if (((SrcVT == MVT::v16f16 && VT == MVT::v8f16) || - (SrcVT == MVT::v16i16 && VT == MVT::v8i16)) && - (Start == 0 || Start == 8)) - return Op; + DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2); + if (NumElt == 2) + Tmp = Args[0]; + else + Tmp = DAG.getBuildVector(NewVT, SL, Args); + + return DAG.getNode(ISD::BITCAST, SL, VT, Tmp); + } DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, VT.getVectorNumElements()); - return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args); + return DAG.getBuildVector(Op.getValueType(), SL, Args); +} + +// TODO: Handle fabs too +static SDValue peekFNeg(SDValue Val) { + if (Val.getOpcode() == ISD::FNEG) + return Val.getOperand(0); + + return Val; } -/// Generate Min/Max node -SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT, - SDValue LHS, SDValue RHS, - SDValue True, SDValue False, - SDValue CC, - DAGCombinerInfo &DCI) const { - if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) - return SDValue(); +static SDValue peekFPSignOps(SDValue Val) { + if (Val.getOpcode() == ISD::FNEG) + Val = Val.getOperand(0); + if (Val.getOpcode() == ISD::FABS) + Val = Val.getOperand(0); + if (Val.getOpcode() == ISD::FCOPYSIGN) + Val = Val.getOperand(0); + return Val; +} +SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl( + const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, + SDValue False, SDValue CC, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); switch (CCOpcode) { @@ -1480,6 +1575,45 @@ SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT, return SDValue(); } +/// Generate 
Min/Max node +SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT, + SDValue LHS, SDValue RHS, + SDValue True, SDValue False, + SDValue CC, + DAGCombinerInfo &DCI) const { + if ((LHS == True && RHS == False) || (LHS == False && RHS == True)) + return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI); + + SelectionDAG &DAG = DCI.DAG; + + // If we can't directly match this, try to see if we can fold an fneg to + // match. + + ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); + ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False); + SDValue NegTrue = peekFNeg(True); + + // Undo the combine foldFreeOpFromSelect does if it helps us match the + // fmin/fmax. + // + // select (fcmp olt (lhs, K)), (fneg lhs), -K + // -> fneg (fmin_legacy lhs, K) + // + // TODO: Use getNegatedExpression + if (LHS == NegTrue && CFalse && CRHS) { + APFloat NegRHS = neg(CRHS->getValueAPF()); + if (NegRHS == CFalse->getValueAPF()) { + SDValue Combined = + combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI); + if (Combined) + return DAG.getNode(ISD::FNEG, DL, VT, Combined); + return SDValue(); + } + } + + return SDValue(); +} + std::pair<SDValue, SDValue> AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); @@ -1749,7 +1883,8 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool UseFmadFtz = false; if (Subtarget->isGCN()) { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - UseFmadFtz = MFI->getMode().allFP32Denormals(); + UseFmadFtz = + MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign(); } // float fr = mad(fqneg, fb, fa); @@ -1811,13 +1946,13 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, SDValue Zero = DAG.getConstant(0, DL, HalfVT); //HiLo split + SDValue LHS_Lo, LHS_Hi; SDValue LHS = Op.getOperand(0); - SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); - SDValue LHS_Hi = 
DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One); + std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT); + SDValue RHS_Lo, RHS_Hi; SDValue RHS = Op.getOperand(1); - SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); - SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One); + std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT); if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { @@ -1841,11 +1976,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // Compute denominator reciprocal. - unsigned FMAD = !Subtarget->hasMadMacF32Insts() ? - (unsigned)ISD::FMA : - !MFI->getMode().allFP32Denormals() ? - (unsigned)ISD::FMAD : - (unsigned)AMDGPUISD::FMAD_FTZ; + unsigned FMAD = + !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA + : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign() + ? 
(unsigned)ISD::FMAD + : (unsigned)AMDGPUISD::FMAD_FTZ; SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo); SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi); @@ -1875,13 +2010,12 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS); SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64); SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1); - SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, - Zero); - SDValue Mulhi1_Hi = - DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One); - SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo, + SDValue Mulhi1_Lo, Mulhi1_Hi; + std::tie(Mulhi1_Lo, Mulhi1_Hi) = + DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT); + SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo, Mulhi1_Lo, Zero1); - SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi, + SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi, Mulhi1_Hi, Add1_Lo.getValue(1)); SDValue Add1 = DAG.getBitcast(VT, DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi})); @@ -1889,13 +2023,12 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, // Second round of UNR. 
SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1); SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2); - SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, - Zero); - SDValue Mulhi2_Hi = - DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One); - SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo, + SDValue Mulhi2_Lo, Mulhi2_Hi; + std::tie(Mulhi2_Lo, Mulhi2_Hi) = + DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT); + SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo, Mulhi2_Lo, Zero1); - SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi, + SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi, Mulhi2_Hi, Add2_Lo.getValue(1)); SDValue Add2 = DAG.getBitcast(VT, DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi})); @@ -1904,11 +2037,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3); - SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero); - SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One); - SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo, + SDValue Mul3_Lo, Mul3_Hi; + std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT); + SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo, Mul3_Lo, Zero1); - SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi, + SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi, Mul3_Hi, Sub1_Lo.getValue(1)); SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi); SDValue Sub1 = DAG.getBitcast(VT, @@ -1926,11 +2059,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, // potential endif to substitute PHIs. // if C3 != 0 ... 
- SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo, + SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo, RHS_Lo, Zero1); - SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi, + SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi, RHS_Hi, Sub1_Lo.getValue(1)); - SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi, + SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi, Zero, Sub2_Lo.getValue(1)); SDValue Sub2 = DAG.getBitcast(VT, DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi})); @@ -1946,11 +2079,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, // if (C6 != 0) SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64); - SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo, + SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo, RHS_Lo, Zero1); - SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi, + SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi, RHS_Hi, Sub2_Lo.getValue(1)); - SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi, + SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi, Zero, Sub3_Lo.getValue(1)); SDValue Sub3 = DAG.getBitcast(VT, DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi})); @@ -2329,27 +2462,445 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } -SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG, - double Log2BaseInverted) const { - EVT VT = Op.getValueType(); +/// Return true if it's known that \p Src can never be an f32 denormal value. 
+static bool valueIsKnownNeverF32Denorm(SDValue Src) { + switch (Src.getOpcode()) { + case ISD::FP_EXTEND: + return Src.getOperand(0).getValueType() == MVT::f16; + case ISD::FP16_TO_FP: + return true; + default: + return false; + } + + llvm_unreachable("covered opcode switch"); +} + +static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags) { + if (Flags.hasApproximateFuncs()) + return true; + auto &Options = DAG.getTarget().Options; + return Options.UnsafeFPMath || Options.ApproxFuncFPMath; +} + +static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, + SDNodeFlags Flags) { + return !valueIsKnownNeverF32Denorm(Src) && + DAG.getMachineFunction() + .getDenormalMode(APFloat::IEEEsingle()) + .Input != DenormalMode::PreserveSign; +} + +SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG, + SDValue Src, + SDNodeFlags Flags) const { + SDLoc SL(Src); + EVT VT = Src.getValueType(); + const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT); + SDValue SmallestNormal = + DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT); + + // Want to scale denormals up, but negatives and 0 work just as well on the + // scaled path. 
+ SDValue IsLtSmallestNormal = DAG.getSetCC( + SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src, + SmallestNormal, ISD::SETOLT); + + return IsLtSmallestNormal; +} + +SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src, + SDNodeFlags Flags) const { + SDLoc SL(Src); + EVT VT = Src.getValueType(); + const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT); + SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT); + + SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags); + SDValue IsFinite = DAG.getSetCC( + SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs, + Inf, ISD::SETOLT); + return IsFinite; +} + +/// If denormal handling is required return the scaled input to FLOG2, and the +/// check for denormal range. Otherwise, return null values. +std::pair<SDValue, SDValue> +AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, + SDValue Src, SDNodeFlags Flags) const { + if (allowApproxFunc(DAG, Flags) || !needsDenormHandlingF32(DAG, Src, Flags)) + return {}; + + MVT VT = MVT::f32; + const fltSemantics &Semantics = APFloat::IEEEsingle(); + SDValue SmallestNormal = + DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT); + + SDValue IsLtSmallestNormal = DAG.getSetCC( + SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src, + SmallestNormal, ISD::SETOLT); + + SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT); + SDValue One = DAG.getConstantFP(1.0, SL, VT); + SDValue ScaleFactor = + DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags); + + SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags); + return {ScaledInput, IsLtSmallestNormal}; +} + +SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const { + // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals. + // If we have to handle denormals, scale up the input and adjust the result. 
+ + // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0) + // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0) SDLoc SL(Op); - SDValue Operand = Op.getOperand(0); - SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand); - SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT); + EVT VT = Op.getValueType(); + SDValue Src = Op.getOperand(0); + SDNodeFlags Flags = Op->getFlags(); + + if (VT == MVT::f16) { + // Nothing in half is a denormal when promoted to f32. + assert(!Subtarget->has16BitInsts()); + SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags); + SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags); + return DAG.getNode(ISD::FP_ROUND, SL, VT, Log, + DAG.getTargetConstant(0, SL, MVT::i32), Flags); + } + + auto [ScaledInput, IsLtSmallestNormal] = + getScaledLogInput(DAG, SL, Src, Flags); + if (!ScaledInput) + return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags); + + SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags); - return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand); + SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT); + SDValue Zero = DAG.getConstantFP(0.0, SL, VT); + SDValue ResultOffset = + DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero); + return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags); } -// exp2(M_LOG2E_F * f); -SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { +static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, + SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) { + SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags); + return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags); +} + +SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, + SelectionDAG &DAG) const { + SDValue X = Op.getOperand(0); EVT VT = Op.getValueType(); + SDNodeFlags Flags = Op->getFlags(); + SDLoc DL(Op); + + const bool IsLog10 = Op.getOpcode() == ISD::FLOG10; + 
assert(IsLog10 || Op.getOpcode() == ISD::FLOG); + + const auto &Options = getTargetMachine().Options; + if (VT == MVT::f16 || Flags.hasApproximateFuncs() || + Options.ApproxFuncFPMath || Options.UnsafeFPMath) { + + if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { + // Log and multiply in f32 is good enough for f16. + X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags); + } + + SDValue Lowered = LowerFLOGUnsafe( + X, DL, DAG, IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2, + Flags); + if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { + return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered, + DAG.getTargetConstant(0, DL, MVT::i32), Flags); + } + + return Lowered; + } + + auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags); + if (ScaledInput) + X = ScaledInput; + + SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags); + + SDValue R; + if (Subtarget->hasFastFMAF32()) { + // c+cc are ln(2)/ln(10) to more than 49 bits + const float c_log10 = 0x1.344134p-2f; + const float cc_log10 = 0x1.09f79ep-26f; + + // c + cc is ln(2) to more than 49 bits + const float c_log = 0x1.62e42ep-1f; + const float cc_log = 0x1.efa39ep-25f; + + SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT); + SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT); + + R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags); + SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags); + SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags); + SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags); + R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags); + } else { + // ch+ct is ln(2)/ln(10) to more than 36 bits + const float ch_log10 = 0x1.344000p-2f; + const float ct_log10 = 0x1.3509f6p-18f; + + // ch + ct is ln(2) to more than 36 bits + const float ch_log = 0x1.62e000p-1f; + const float ct_log = 0x1.0bfbe8p-15f; + + SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT); + SDValue CT = DAG.getConstantFP(IsLog10 ? 
ct_log10 : ct_log, DL, VT); + + SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y); + SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32); + SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst); + SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt); + SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags); + + SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags); + SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags); + SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags); + R = getMad(DAG, DL, VT, YH, CH, Mad1); + } + + const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && + (Flags.hasNoInfs() || Options.NoInfsFPMath); + + // TODO: Check if known finite from source value. + if (!IsFiniteOnly) { + SDValue IsFinite = getIsFinite(DAG, Y, Flags); + R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags); + } + + if (IsScaled) { + SDValue Zero = DAG.getConstantFP(0.0f, DL, VT); + SDValue ShiftK = + DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT); + SDValue Shift = + DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags); + R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags); + } + + return R; +} + +SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const { + return LowerFLOGCommon(Op, DAG); +} + +// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a +// promote f16 operation. +SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL, + SelectionDAG &DAG, + double Log2BaseInverted, + SDNodeFlags Flags) const { + EVT VT = Src.getValueType(); + unsigned LogOp = VT == MVT::f32 ? 
AMDGPUISD::LOG : ISD::FLOG2; + SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags); + SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT); + + return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand, + Flags); +} + +SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const { + // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals. + // If we have to handle denormals, scale up the input and adjust the result. + SDLoc SL(Op); + EVT VT = Op.getValueType(); SDValue Src = Op.getOperand(0); + SDNodeFlags Flags = Op->getFlags(); + + if (VT == MVT::f16) { + // Nothing in half is a denormal when promoted to f32. + assert(!Subtarget->has16BitInsts()); + SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags); + SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags); + return DAG.getNode(ISD::FP_ROUND, SL, VT, Log, + DAG.getTargetConstant(0, SL, MVT::i32), Flags); + } + assert(VT == MVT::f32); + + if (allowApproxFunc(DAG, Flags) || !needsDenormHandlingF32(DAG, Src, Flags)) + return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags); + + // bool needs_scaling = x < -0x1.f80000p+6f; + // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 
0x1.0p-64f : 1.0f); + + // -nextafter(128.0, -1) + SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT); + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + + SDValue NeedsScaling = + DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT); + + SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT); + SDValue Zero = DAG.getConstantFP(0.0, SL, VT); + + SDValue AddOffset = + DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero); + + SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags); + SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags); + + SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT); + SDValue One = DAG.getConstantFP(1.0, SL, VT); + SDValue ResultScale = + DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One); + + return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags); +} + +SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, + SelectionDAG &DAG, + SDNodeFlags Flags) const { + // exp2(M_LOG2E_F * f); + EVT VT = Op.getValueType(); const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags()); - return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags()); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Op, K, Flags); + return DAG.getNode(VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2, SL, VT, Mul, + Flags); +} + +SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + SDNodeFlags Flags = Op->getFlags(); + const bool IsExp10 = false; // TODO: For some reason exp10 is missing + + if (VT.getScalarType() == MVT::f16) { + // v_exp_f16 (fmul x, log2e) + if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast? 
+ return lowerFEXPUnsafe(X, SL, DAG, Flags); + + if (VT.isVector()) + return SDValue(); + + // exp(f16 x) -> + // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) + + // Nothing in half is a denormal when promoted to f32. + SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags); + SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags); + return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered, + DAG.getTargetConstant(0, SL, MVT::i32), Flags); + } + + assert(VT == MVT::f32); + + // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying + // library behavior. Also, is known-not-daz source sufficient? + if (allowApproxFunc(DAG, Flags) && !needsDenormHandlingF32(DAG, X, Flags)) { + assert(!IsExp10 && "todo exp10 support"); + return lowerFEXPUnsafe(X, SL, DAG, Flags); + } + + // Algorithm: + // + // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) + // + // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer + // n = 64*m + j, 0 <= j < 64 + // + // e^x = 2^((64*m + j + f)/64) + // = (2^m) * (2^(j/64)) * 2^(f/64) + // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) + // + // f = x*(64/ln(2)) - n + // r = f*(ln(2)/64) = x - n*(ln(2)/64) + // + // e^x = (2^m) * (2^(j/64)) * e^r + // + // (2^(j/64)) is precomputed + // + // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + // e^r = 1 + q + // + // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + // + // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) + SDNodeFlags FlagsNoContract = Flags; + FlagsNoContract.setAllowContract(false); + + SDValue PH, PL; + if (Subtarget->hasFastFMAF32()) { + const float c_exp = numbers::log2ef; + const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits + const float c_exp10 = 0x1.a934f0p+1f; + const float cc_exp10 = 0x1.2f346ep-24f; + + SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT); + SDValue CC = DAG.getConstantFP(IsExp10 ? 
cc_exp10 : cc_exp, SL, VT); + + PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags); + SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags); + SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags); + PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags); + } else { + const float ch_exp = 0x1.714000p+0f; + const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits + + const float ch_exp10 = 0x1.a92000p+1f; + const float cl_exp10 = 0x1.4f0978p-11f; + + SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT); + SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT); + + SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X); + SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32); + SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst); + SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt); + SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags); + + PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags); + + SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags); + SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags); + PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags); + } + + SDValue E = DAG.getNode(ISD::FRINT, SL, VT, PH, Flags); + + // It is unsafe to contract this fsub into the PH multiply. + SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract); + + SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags); + SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E); + SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags); + + SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags); + + SDValue UnderflowCheckConst = + DAG.getConstantFP(IsExp10 ? 
-0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT); + + EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + SDValue Zero = DAG.getConstantFP(0.0, SL, VT); + SDValue Underflow = + DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT); + + R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R); + const auto &Options = getTargetMachine().Options; + + if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) { + SDValue OverflowCheckConst = + DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT); + SDValue Overflow = + DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT); + SDValue Inf = + DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT); + R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R); + } + + return R; } static bool isCtlzOpc(unsigned Opc) { @@ -2518,7 +3069,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, ShAmt); // On GCN, use LDEXP directly. if (Subtarget->isGCN()) - return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt); + return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt); // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent // part directly to emulate the multiplication of 2^ShAmt. That 8-bit @@ -2551,7 +3102,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo); - SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, + SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi, DAG.getConstant(32, SL, MVT::i32)); // TODO: Should this propagate fast-math-flags? 
return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); @@ -2670,15 +3221,17 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, SDValue K0, K1; if (SrcVT == MVT::f64) { - K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)), - SL, SrcVT); - K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), - SL, SrcVT); + K0 = DAG.getConstantFP( + llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL, + SrcVT); + K1 = DAG.getConstantFP( + llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL, + SrcVT); } else { - K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL, - SrcVT); - K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL, - SrcVT); + K0 = DAG.getConstantFP( + llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT); + K1 = DAG.getConstantFP( + llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT); } // TODO: Should this propagate fast-math-flags? SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0); @@ -3128,6 +3681,17 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( SDValue Src = N->getOperand(1); return Src.isUndef() ? Src : SDValue(); } + case Intrinsic::amdgcn_frexp_exp: { + // frexp_exp (fneg x) -> frexp_exp x + // frexp_exp (fabs x) -> frexp_exp x + // frexp_exp (fneg (fabs x)) -> frexp_exp x + SDValue Src = N->getOperand(1); + SDValue PeekSign = peekFPSignOps(Src); + if (PeekSign == Src) + return SDValue(); + return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign), + 0); + } default: return SDValue(); } @@ -3419,6 +3983,16 @@ static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi); } +/// If \p V is an add of a constant 1, returns the other operand. Otherwise +/// return SDValue(). 
+static SDValue getAddOneOp(const SDNode *V) { + if (V->getOpcode() != ISD::ADD) + return SDValue(); + + auto *C = dyn_cast<ConstantSDNode>(V->getOperand(1)); + return C && C->isOne() ? V->getOperand(0) : SDValue(); +} + SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); @@ -3434,16 +4008,49 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, if (VT.isVector() || Size > 64) return SDValue(); - // There are i16 integer mul/mad. - if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16)) - return SDValue(); - SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); + // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad + // matching. + + // mul x, (add y, 1) -> add (mul x, y), x + auto IsFoldableAdd = [](SDValue V) -> SDValue { + SDValue AddOp = getAddOneOp(V.getNode()); + if (!AddOp) + return SDValue(); + + if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool { + return U->getOpcode() == ISD::MUL; + })) + return AddOp; + + return SDValue(); + }; + + // FIXME: The selection pattern is not properly checking for commuted + // operands, so we have to place the mul in the LHS + if (SDValue MulOper = IsFoldableAdd(N0)) { + SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper); + return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1); + } + + if (SDValue MulOper = IsFoldableAdd(N1)) { + SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper); + return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0); + } + + // Skip if already mul24. + if (N->getOpcode() != ISD::MUL) + return SDValue(); + + // There are i16 integer mul/mad. + if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16)) + return SDValue(); + // SimplifyDemandedBits has the annoying habit of turning useful zero_extends // in the source into any_extends if the result of the mul is truncated. 
Since // we can assume the high bits are whatever we want, use the underlying value @@ -3583,12 +4190,6 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N, return DAG.getZExtOrTrunc(Mulhi, DL, VT); } -static bool isNegativeOne(SDValue Val) { - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) - return C->isAllOnes(); - return false; -} - SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, @@ -3631,7 +4232,7 @@ SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue C // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x if (CCOpcode == ISD::SETEQ && (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && - RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) { + RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) { unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32; return getFFBX_U32(DAG, CmpLHS, SL, Opc); @@ -3641,7 +4242,7 @@ SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue C // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x if (CCOpcode == ISD::SETNE && (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) && - LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) { + LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) { unsigned Opc = isCttzOpc(LHS.getOpcode()) ? 
AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32; @@ -3673,8 +4274,9 @@ static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, // // select c, (fabs x), (fabs y) -> fabs (select c, x, y) // select c, (fabs x), +k -> fabs (select c, x, k) -static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, - SDValue N) { +SDValue +AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, + SDValue N) const { SelectionDAG &DAG = DCI.DAG; SDValue Cond = N.getOperand(0); SDValue LHS = N.getOperand(1); @@ -3683,6 +4285,9 @@ static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, EVT VT = N.getValueType(); if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) || (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) { + if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode())) + return SDValue(); + return distributeOpThroughSelect(DCI, LHS.getOpcode(), SDLoc(N), Cond, LHS, RHS); } @@ -3695,7 +4300,8 @@ static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, // TODO: Support vector constants. ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); - if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) { + if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS && + !selectSupportsSourceMods(N.getNode())) { SDLoc SL(N); // If one side is an fneg/fabs and the other is a constant, we can push the // fneg/fabs down. If it's an fabs, the constant needs to be non-negative. 
@@ -3707,17 +4313,31 @@ static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, if (NewLHS.hasOneUse()) { unsigned Opc = NewLHS.getOpcode(); - if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc)) + if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode())) ShouldFoldNeg = false; if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL) ShouldFoldNeg = false; } if (ShouldFoldNeg) { + if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative()) + return SDValue(); + + // We're going to be forced to use a source modifier anyway, there's no + // point to pulling the negate out unless we can get a size reduction by + // negating the constant. + // + // TODO: Generalize to use getCheaperNegatedExpression which doesn't know + // about cheaper constants. + if (NewLHS.getOpcode() == ISD::FABS && + getConstantNegateCost(CRHS) != NegatibleCost::Cheaper) + return SDValue(); + + if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode())) + return SDValue(); + if (LHS.getOpcode() == ISD::FNEG) NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); - else if (CRHS->isNegative()) - return SDValue(); if (Inv) std::swap(NewLHS, NewRHS); @@ -3732,7 +4352,6 @@ static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } - SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) @@ -3791,15 +4410,26 @@ static bool isInv2Pi(const APFloat &APF) { // 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an // additional cost to negate them. -bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const { - if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) { - if (C->isZero() && !C->isNegative()) - return true; +TargetLowering::NegatibleCost +AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const { + if (C->isZero()) + return C->isNegative() ? 
NegatibleCost::Cheaper : NegatibleCost::Expensive; - if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF())) - return true; - } + if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF())) + return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive; + + return NegatibleCost::Neutral; +} + +bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const { + if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) + return getConstantNegateCost(C) == NegatibleCost::Expensive; + return false; +} +bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const { + if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) + return getConstantNegateCost(C) == NegatibleCost::Cheaper; return false; } @@ -3822,14 +4452,9 @@ static unsigned inverseMinMax(unsigned Opc) { } } -SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - SDValue N0 = N->getOperand(0); - EVT VT = N->getValueType(0); - - unsigned Opc = N0.getOpcode(); - +/// \return true if it's profitable to try to push an fneg into its source +/// instruction. +bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) { // If the input has multiple uses and we can either fold the negate down, or // the other uses cannot, give up. This both prevents unprofitable // transformations and infinite loops: we won't repeatedly try to fold around @@ -3838,13 +4463,27 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, // This may be able to fold into the source, but at a code size cost. Don't // fold if the fold into the user is free. 
if (allUsesHaveSourceMods(N, 0)) - return SDValue(); + return false; } else { - if (fnegFoldsIntoOp(Opc) && + if (fnegFoldsIntoOp(N0.getNode()) && (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode()))) - return SDValue(); + return false; } + return true; +} + +SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + unsigned Opc = N0.getOpcode(); + + if (!shouldFoldFNegIntoSrc(N, N0)) + return SDValue(); + SDLoc SL(N); switch (Opc) { case ISD::FADD: { @@ -4027,6 +4666,67 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, DAG.getConstant(0x8000, SL, SrcVT)); return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg); } + case ISD::SELECT: { + // fneg (select c, a, b) -> select c, (fneg a), (fneg b) + // TODO: Invert conditions of foldFreeOpFromSelect + return SDValue(); + } + case ISD::BITCAST: { + SDLoc SL(N); + SDValue BCSrc = N0.getOperand(0); + if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) { + SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1); + if (HighBits.getValueType().getSizeInBits() != 32 || + !fnegFoldsIntoOp(HighBits.getNode())) + return SDValue(); + + // f64 fneg only really needs to operate on the high half of of the + // register, so try to force it to an f32 operation to help make use of + // source modifiers. 
+ // + // + // fneg (f64 (bitcast (build_vector x, y))) -> + // f64 (bitcast (build_vector (bitcast i32:x to f32), + // (fneg (bitcast i32:y to f32))) + + SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits); + SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi); + SDValue CastBack = + DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi); + + SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end()); + Ops.back() = CastBack; + DCI.AddToWorklist(NegHi.getNode()); + SDValue Build = + DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops); + SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build); + + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result)); + return Result; + } + + if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 && + BCSrc.hasOneUse()) { + // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) -> + // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32) + + // TODO: Cast back result for multiple uses is beneficial in some cases. 
+ + SDValue LHS = + DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1)); + SDValue RHS = + DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2)); + + SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS); + SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS); + + return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS, + NegRHS); + } + + return SDValue(); + } default: return SDValue(); } @@ -4158,6 +4858,15 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performTruncateCombine(N, DCI); case ISD::MUL: return performMulCombine(N, DCI); + case AMDGPUISD::MUL_U24: + case AMDGPUISD::MUL_I24: { + if (SDValue Simplified = simplifyMul24(N, DCI)) + return Simplified; + return performMulCombine(N, DCI); + } + case AMDGPUISD::MULHI_I24: + case AMDGPUISD::MULHI_U24: + return simplifyMul24(N, DCI); case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: return performMulLoHiCombine(N, DCI); @@ -4165,11 +4874,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performMulhsCombine(N, DCI); case ISD::MULHU: return performMulhuCombine(N, DCI); - case AMDGPUISD::MUL_I24: - case AMDGPUISD::MUL_U24: - case AMDGPUISD::MULHI_I24: - case AMDGPUISD::MULHI_U24: - return simplifyMul24(N, DCI); case ISD::SELECT: return performSelectCombine(N, DCI); case ISD::FNEG: @@ -4365,7 +5069,7 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, return V; unsigned Mask = Arg.getMask(); - unsigned Shift = countTrailingZeros<unsigned>(Mask); + unsigned Shift = llvm::countr_zero<unsigned>(Mask); V = DAG.getNode(ISD::SRL, SL, VT, V, DAG.getShiftAmountConstant(Shift, VT, SL)); return DAG.getNode(ISD::AND, SL, VT, V, @@ -4373,14 +5077,11 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, } uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( - const MachineFunction &MF, const ImplicitParameter Param) const { - const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); - const 
AMDGPUSubtarget &ST = - AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction()); - unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction()); - const Align Alignment = ST.getAlignmentForImplicitArgPtr(); - uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) + - ExplicitArgOffset; + uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const { + unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset(); + const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr(); + uint64_t ArgOffset = + alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset; switch (Param) { case FIRST_IMPLICIT: return ArgOffset; @@ -4394,6 +5095,12 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( llvm_unreachable("unexpected implicit parameter type"); } +uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( + const MachineFunction &MF, const ImplicitParameter Param) const { + const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); + return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param); +} + #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -4409,10 +5116,12 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(LOOP) NODE_NAME_CASE(CALL) NODE_NAME_CASE(TC_RETURN) + NODE_NAME_CASE(TC_RETURN_GFX) NODE_NAME_CASE(TRAP) - NODE_NAME_CASE(RET_FLAG) + NODE_NAME_CASE(RET_GLUE) NODE_NAME_CASE(RETURN_TO_EPILOG) NODE_NAME_CASE(ENDPGM) + NODE_NAME_CASE(ENDPGM_TRAP) NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) NODE_NAME_CASE(SETCC) @@ -4444,9 +5153,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(RSQ) NODE_NAME_CASE(RCP_LEGACY) NODE_NAME_CASE(RCP_IFLAG) + NODE_NAME_CASE(LOG) + NODE_NAME_CASE(EXP) NODE_NAME_CASE(FMUL_LEGACY) NODE_NAME_CASE(RSQ_CLAMP) - NODE_NAME_CASE(LDEXP) NODE_NAME_CASE(FP_CLASS) 
NODE_NAME_CASE(DOT4) NODE_NAME_CASE(CARRY) @@ -4508,8 +5218,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(DS_ORDERED_COUNT) NODE_NAME_CASE(ATOMIC_CMP_SWAP) - NODE_NAME_CASE(ATOMIC_INC) - NODE_NAME_CASE(ATOMIC_DEC) NODE_NAME_CASE(ATOMIC_LOAD_FMIN) NODE_NAME_CASE(ATOMIC_LOAD_FMAX) NODE_NAME_CASE(BUFFER_LOAD) @@ -4725,31 +5433,38 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( Known.Zero.setLowBits(Log2(Alignment)); break; } + case AMDGPUISD::SMIN3: + case AMDGPUISD::SMAX3: + case AMDGPUISD::SMED3: + case AMDGPUISD::UMIN3: + case AMDGPUISD::UMAX3: + case AMDGPUISD::UMED3: { + KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1); + if (Known2.isUnknown()) + break; + + KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); + if (Known1.isUnknown()) + break; + + KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); + if (Known0.isUnknown()) + break; + + // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling. + Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero; + Known.One = Known0.One & Known1.One & Known2.One; + break; + } case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); switch (IID) { - case Intrinsic::amdgcn_mbcnt_lo: - case Intrinsic::amdgcn_mbcnt_hi: { - const GCNSubtarget &ST = - DAG.getMachineFunction().getSubtarget<GCNSubtarget>(); - // These return at most the (wavefront size - 1) + src1 - // As long as src1 is an immediate we can calc known bits - KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1); - unsigned Src1ValBits = Src1Known.countMaxActiveBits(); - unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2()); - // Cater for potential carry - MaxActiveBits += Src1ValBits ? 
1 : 0; - unsigned Size = Op.getValueType().getSizeInBits(); - if (MaxActiveBits < Size) - Known.Zero.setHighBits(Size - MaxActiveBits); - break; - } case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::amdgcn_workitem_id_z: { unsigned MaxValue = Subtarget->getMaxWorkitemID( DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID)); - Known.Zero.setHighBits(countLeadingZeros(MaxValue)); + Known.Zero.setHighBits(llvm::countl_zero(MaxValue)); break; } default: @@ -4795,6 +5510,26 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( return 16; case AMDGPUISD::FP_TO_FP16: return 16; + case AMDGPUISD::SMIN3: + case AMDGPUISD::SMAX3: + case AMDGPUISD::SMED3: + case AMDGPUISD::UMIN3: + case AMDGPUISD::UMAX3: + case AMDGPUISD::UMED3: { + unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1); + if (Tmp2 == 1) + return 1; // Early out. + + unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1); + if (Tmp1 == 1) + return 1; // Early out. + + unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); + if (Tmp0 == 1) + return 1; // Early out. 
+ + return std::min(Tmp0, std::min(Tmp1, Tmp2)); + } default: return 1; } @@ -4818,6 +5553,20 @@ unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr( return 24; case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: return 16; + case AMDGPU::G_AMDGPU_SMED3: + case AMDGPU::G_AMDGPU_UMED3: { + auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs(); + unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1); + if (Tmp2 == 1) + return 1; + unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1); + if (Tmp1 == 1) + return 1; + unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1); + if (Tmp0 == 1) + return 1; + return std::min(Tmp0, std::min(Tmp1, Tmp2)); + } default: return 1; } @@ -4871,7 +5620,7 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, // TODO: Need is known positive check. return false; } - case AMDGPUISD::LDEXP: + case ISD::FLDEXP: case AMDGPUISD::FRACT: { if (SNaN) return true; @@ -4936,6 +5685,11 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, } } +bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, + Register N0, Register N1) const { + return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks +} + TargetLowering::AtomicExpansionKind AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { switch (RMW->getOperation()) { @@ -4962,3 +5716,22 @@ bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal( return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) && Ty2 == LLT::scalar(32); } + +/// Whether it is profitable to sink the operands of an +/// Instruction I to the basic block of I. +/// This helps using several modifiers (like abs and neg) more often. +bool AMDGPUTargetLowering::shouldSinkOperands( + Instruction *I, SmallVectorImpl<Use *> &Ops) const { + using namespace PatternMatch; + + for (auto &Op : I->operands()) { + // Ensure we are not already sinking this operand. 
+ if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); })) + continue; + + if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) + Ops.push_back(&Op); + } + + return !Ops.empty(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index bc3b57a82d08..26b91155ba85 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -60,8 +60,23 @@ protected: SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFLOG(SDValue Op, SelectionDAG &DAG, - double Log2BaseInverted) const; + + SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, + SDNodeFlags Flags) const; + SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const; + std::pair<SDValue, SDValue> getScaledLogInput(SelectionDAG &DAG, + const SDLoc SL, SDValue Op, + SDNodeFlags Flags) const; + + SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, + double Log2BaseInverted, SDNodeFlags Flags) const; + SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const; + + SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, + SDNodeFlags Flags) const; SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const; @@ -97,9 +112,16 @@ protected: SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const; + + SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, + SDValue N) const; SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) 
const; + TargetLowering::NegatibleCost + getConstantNegateCost(const ConstantFPSDNode *C) const; + bool isConstantCostlierToNegate(SDValue N) const; + bool isConstantCheaperToNegate(SDValue N) const; SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -156,6 +178,7 @@ public: return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val; } + static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc); static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4); bool isFAbsFree(EVT VT) const override; @@ -165,14 +188,13 @@ public: bool isZExtFree(Type *Src, Type *Dest) const override; bool isZExtFree(EVT Src, EVT Dest) const override; - bool isZExtFree(SDValue Val, EVT VT2) const override; SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override; - bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; + bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override; bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override; @@ -193,7 +215,7 @@ public: bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final; - bool storeOfVectorConstantIsCheap(EVT MemVT, + bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override; bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override; @@ -229,6 +251,10 @@ public: SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; + SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, + SDValue RHS, SDValue True, SDValue False, + SDValue CC, DAGCombinerInfo &DCI) const; + SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, 
DAGCombinerInfo &DCI) const; @@ -281,6 +307,9 @@ public: bool SNaN = false, unsigned Depth = 0) const override; + bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, + Register N1) const override; + /// Helper function that adds Reg to the LiveIn list of the DAG's /// MachineFunction. /// @@ -333,6 +362,8 @@ public: /// type of implicit parameter. uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const; + uint32_t getImplicitParameterOffset(const uint64_t ExplicitKernArgSize, + const ImplicitParameter Param) const; MVT getFenceOperandTy(const DataLayout &DL) const override { return MVT::i32; @@ -342,6 +373,9 @@ public: bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1, LLT Ty2) const override; + + bool shouldSinkOperands(Instruction *I, + SmallVectorImpl<Use *> &Ops) const override; }; namespace AMDGPUISD { @@ -356,6 +390,7 @@ enum NodeType : unsigned { // Function call. CALL, TC_RETURN, + TC_RETURN_GFX, TRAP, // Masked control flow nodes. @@ -366,11 +401,14 @@ enum NodeType : unsigned { // A uniform kernel return that terminates the wavefront. ENDPGM, + // s_endpgm, but we may want to insert it in the middle of the block. + ENDPGM_TRAP, + // Return to a shader part's epilog code. RETURN_TO_EPILOG, // Return with values from a non-entry function. - RET_FLAG, + RET_GLUE, DWORDADDR, FRACT, @@ -421,9 +459,15 @@ enum NodeType : unsigned { RSQ, RCP_LEGACY, RCP_IFLAG, + + // log2, no denormal handling for f32. + LOG, + + // exp2, no denormal handling for f32. 
+ EXP, + FMUL_LEGACY, RSQ_CLAMP, - LDEXP, FP_CLASS, DOT4, CARRY, @@ -505,8 +549,6 @@ enum NodeType : unsigned { TBUFFER_LOAD_FORMAT_D16, DS_ORDERED_COUNT, ATOMIC_CMP_SWAP, - ATOMIC_INC, - ATOMIC_DEC, ATOMIC_LOAD_FMIN, ATOMIC_LOAD_FMAX, BUFFER_LOAD, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp index c9cdbc89f3a4..7619a39bac9c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -51,7 +51,7 @@ public: MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64) return true; if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - (MI.getOperand(0).getImm() & 0xf000) == 0) + AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0) return true; return false; } @@ -77,11 +77,15 @@ public: struct DelayInfo { // One larger than the maximum number of (non-TRANS) VALU instructions we // can encode in an s_delay_alu instruction. - static const unsigned VALU_MAX = 5; + static constexpr unsigned VALU_MAX = 5; // One larger than the maximum number of TRANS instructions we can encode in // an s_delay_alu instruction. - static const unsigned TRANS_MAX = 4; + static constexpr unsigned TRANS_MAX = 4; + + // One larger than the maximum number of SALU cycles we can encode in an + // s_delay_alu instruction. + static constexpr unsigned SALU_CYCLES_MAX = 4; // If it was written by a (non-TRANS) VALU, remember how many clock cycles // are left until it completes, and how many other (non-TRANS) VALU we have @@ -120,7 +124,9 @@ public: TRANSNumVALU = 0; break; case SALU: - SALUCycles = Cycles; + // Guard against pseudo-instructions like SI_CALL which are marked as + // SALU but with a very high latency. + SALUCycles = std::min(Cycles, SALU_CYCLES_MAX); break; } } @@ -278,6 +284,7 @@ public: // Wait for an SALU instruction. 
if (Delay.SALUCycles) { + assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX); if (Imm & 0x780) { // We have already encoded a VALU and a TRANS delay. There's no room in // the encoding for an SALU delay as well, so just drop it. @@ -349,6 +356,7 @@ public: if (instructionWaitsForVALU(MI)) { // Forget about all outstanding VALU delays. + // TODO: This is overkill since it also forgets about SALU delays. State = DelayState(); } else if (Type != OTHER) { DelayInfo Delay; @@ -360,11 +368,11 @@ public: // ignore this operand. if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied()) continue; - for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) { - auto It = State.find(*UI); + for (MCRegUnit Unit : TRI->regunits(Op.getReg())) { + auto It = State.find(Unit); if (It != State.end()) { Delay.merge(It->second); - State.erase(*UI); + State.erase(Unit); } } } @@ -380,9 +388,9 @@ public: // TODO: Scan implicit defs too? for (const auto &Op : MI.defs()) { unsigned Latency = SchedModel.computeOperandLatency( - &MI, MI.getOperandNo(&Op), nullptr, 0); - for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) - State[*UI] = DelayInfo(Type, Latency); + &MI, Op.getOperandNo(), nullptr, 0); + for (MCRegUnit Unit : TRI->regunits(Op.getReg())) + State[Unit] = DelayInfo(Type, Latency); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 62c3eec41836..3c399e497227 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -23,6 +23,7 @@ #include <optional> using namespace llvm; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "AMDGPUtti" @@ -328,7 +329,8 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST, }); } -bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1, +bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I, + const Value *Op0, const Value *Op1, 
InstCombiner &IC) const { // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or // infinity, gives +0.0. If we can prove we don't have one of the special @@ -340,15 +342,72 @@ bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1, // One operand is not zero or infinity or NaN. return true; } + auto *TLI = &IC.getTargetLibraryInfo(); - if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) && - isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) { + if (isKnownNeverInfOrNaN(Op0, IC.getDataLayout(), TLI, 0, + &IC.getAssumptionCache(), &I, + &IC.getDominatorTree()) && + isKnownNeverInfOrNaN(Op1, IC.getDataLayout(), TLI, 0, + &IC.getAssumptionCache(), &I, + &IC.getDominatorTree())) { // Neither operand is infinity or NaN. return true; } return false; } +/// Match an fpext from half to float, or a constant we can convert. +static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) { + if (match(Arg, m_OneUse(m_FPExt(m_Value(FPExtSrc))))) + return FPExtSrc->getType()->isHalfTy(); + + ConstantFP *CFP; + if (match(Arg, m_ConstantFP(CFP))) { + bool LosesInfo; + APFloat Val(CFP->getValueAPF()); + Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo); + if (LosesInfo) + return false; + + FPExtSrc = ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val); + return true; + } + + return false; +} + +// Trim all zero components from the end of the vector \p UseV and return +// an appropriate bitset with known elements. 
+static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV, + Instruction *I) { + auto *VTy = cast<FixedVectorType>(UseV->getType()); + unsigned VWidth = VTy->getNumElements(); + APInt DemandedElts = APInt::getAllOnes(VWidth); + + for (int i = VWidth - 1; i > 0; --i) { + auto *Elt = findScalarElement(UseV, i); + if (!Elt) + break; + + if (auto *ConstElt = dyn_cast<Constant>(Elt)) { + if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt)) + break; + } else { + break; + } + + DemandedElts.clearBit(i); + } + + return DemandedElts; +} + +static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, + IntrinsicInst &II, + APInt DemandedElts, + int DMaskIdx = -1, + bool IsLoad = true); + std::optional<Instruction *> GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Intrinsic::ID IID = II.getIntrinsicID(); @@ -393,6 +452,54 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } + case Intrinsic::amdgcn_log: + case Intrinsic::amdgcn_exp2: { + const bool IsLog = IID == Intrinsic::amdgcn_log; + const bool IsExp = IID == Intrinsic::amdgcn_exp2; + Value *Src = II.getArgOperand(0); + Type *Ty = II.getType(); + + if (isa<PoisonValue>(Src)) + return IC.replaceInstUsesWith(II, Src); + + if (IC.getSimplifyQuery().isUndefValue(Src)) + return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty)); + + if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) { + if (C->isInfinity()) { + // exp2(+inf) -> +inf + // log2(+inf) -> +inf + if (!C->isNegative()) + return IC.replaceInstUsesWith(II, C); + + // exp2(-inf) -> 0 + if (IsExp && C->isNegative()) + return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty)); + } + + if (II.isStrictFP()) + break; + + if (C->isNaN()) { + Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet()); + return IC.replaceInstUsesWith(II, Quieted); + } + + // f32 instruction doesn't handle denormals, f16 does. 
+ if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) { + Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true) + : ConstantFP::get(Ty, 1.0); + return IC.replaceInstUsesWith(II, FoldedValue); + } + + if (IsLog && C->isNegative()) + return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty)); + + // TODO: Full constant folding matching hardware behavior. + } + + break; + } case Intrinsic::amdgcn_frexp_mant: case Intrinsic::amdgcn_frexp_exp: { Value *Src = II.getArgOperand(0); @@ -423,85 +530,31 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Value *Src0 = II.getArgOperand(0); Value *Src1 = II.getArgOperand(1); const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1); - if (!CMask) { - if (isa<UndefValue>(Src0)) { - return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); - } + if (CMask) { + II.setCalledOperand(Intrinsic::getDeclaration( + II.getModule(), Intrinsic::is_fpclass, Src0->getType())); - if (isa<UndefValue>(Src1)) { - return IC.replaceInstUsesWith(II, - ConstantInt::get(II.getType(), false)); - } - break; + // Clamp any excess bits, as they're illegal for the generic intrinsic. + II.setArgOperand(1, ConstantInt::get(Src1->getType(), + CMask->getZExtValue() & fcAllFlags)); + return &II; } - uint32_t Mask = CMask->getZExtValue(); - - // If all tests are made, it doesn't matter what the value is. - if ((Mask & fcAllFlags) == fcAllFlags) { - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true)); - } + // Propagate poison. + if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1)) + return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType())); - if ((Mask & fcAllFlags) == 0) { + // llvm.amdgcn.class(_, undef) -> false + if (IC.getSimplifyQuery().isUndefValue(Src1)) return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false)); - } - - if (Mask == fcNan && !II.isStrictFP()) { - // Equivalent of isnan. Replace with standard fcmp. 
- Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0); - FCmp->takeName(&II); - return IC.replaceInstUsesWith(II, FCmp); - } - - if (Mask == fcZero && !II.isStrictFP()) { - // Equivalent of == 0. - Value *FCmp = - IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0)); - FCmp->takeName(&II); - return IC.replaceInstUsesWith(II, FCmp); + // llvm.amdgcn.class(undef, mask) -> mask != 0 + if (IC.getSimplifyQuery().isUndefValue(Src0)) { + Value *CmpMask = IC.Builder.CreateICmpNE( + Src1, ConstantInt::getNullValue(Src1->getType())); + return IC.replaceInstUsesWith(II, CmpMask); } - - // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other - if ((Mask & fcNan) && isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) { - return IC.replaceOperand( - II, 1, ConstantInt::get(Src1->getType(), Mask & ~fcNan)); - } - - const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0); - if (!CVal) { - if (isa<UndefValue>(Src0)) { - return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); - } - - // Clamp mask to used bits - if ((Mask & fcAllFlags) != Mask) { - CallInst *NewCall = IC.Builder.CreateCall( - II.getCalledFunction(), - {Src0, ConstantInt::get(Src1->getType(), Mask & fcAllFlags)}); - - NewCall->takeName(&II); - return IC.replaceInstUsesWith(II, NewCall); - } - - break; - } - - const APFloat &Val = CVal->getValueAPF(); - - bool Result = - ((Mask & fcSNan) && Val.isNaN() && Val.isSignaling()) || - ((Mask & fcQNan) && Val.isNaN() && !Val.isSignaling()) || - ((Mask & fcNegInf) && Val.isInfinity() && Val.isNegative()) || - ((Mask & fcNegNormal) && Val.isNormal() && Val.isNegative()) || - ((Mask & fcNegSubnormal) && Val.isDenormal() && Val.isNegative()) || - ((Mask & fcNegZero) && Val.isZero() && Val.isNegative()) || - ((Mask & fcPosZero) && Val.isZero() && !Val.isNegative()) || - ((Mask & fcPosSubnormal) && Val.isDenormal() && !Val.isNegative()) || - ((Mask & fcPosNormal) && Val.isNormal() && !Val.isNegative()) || - ((Mask & fcPosInf) && Val.isInfinity() && 
!Val.isNegative()); - - return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result)); + break; } case Intrinsic::amdgcn_cvt_pkrtz: { Value *Src0 = II.getArgOperand(0); @@ -695,6 +748,20 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } } + if (!ST->hasMed3_16()) + break; + + Value *X, *Y, *Z; + + // Repeat floating-point width reduction done for minnum/maxnum. + // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z)) + if (matchFPExtFromF16(Src0, X) && matchFPExtFromF16(Src1, Y) && + matchFPExtFromF16(Src2, Z)) { + Value *NewCall = IC.Builder.CreateIntrinsic(IID, {X->getType()}, + {X, Y, Z}, &II, II.getName()); + return new FPExtInst(NewCall, II.getType()); + } + break; } case Intrinsic::amdgcn_icmp: @@ -835,31 +902,18 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } + case Intrinsic::amdgcn_mbcnt_hi: { + // exec_hi is all 0, so this is just a copy. + if (ST->isWave32()) + return IC.replaceInstUsesWith(II, II.getArgOperand(1)); + break; + } case Intrinsic::amdgcn_ballot: { if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) { if (Src->isZero()) { // amdgcn.ballot(i1 0) is zero. return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType())); } - - if (Src->isOne()) { - // amdgcn.ballot(i1 1) is exec. 
- const char *RegName = "exec"; - if (II.getType()->isIntegerTy(32)) - RegName = "exec_lo"; - else if (!II.getType()->isIntegerTy(64)) - break; - - Function *NewF = Intrinsic::getDeclaration( - II.getModule(), Intrinsic::read_register, II.getType()); - Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)}; - MDNode *MD = MDNode::get(II.getContext(), MDArgs); - Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)}; - CallInst *NewCall = IC.Builder.CreateCall(NewF, Args); - NewCall->addFnAttr(Attribute::Convergent); - NewCall->takeName(&II); - return IC.replaceInstUsesWith(II, NewCall); - } } break; } @@ -981,13 +1035,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (II.isStrictFP()) break; - if (C && C->isNaN()) { - // FIXME: We just need to make the nan quiet here, but that's unavailable - // on APFloat, only IEEEfloat - auto *Quieted = - ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven)); - return IC.replaceInstUsesWith(II, Quieted); - } + if (C && C->isNaN()) + return IC.replaceInstUsesWith(II, ConstantFP::get(Ty, C->makeQuiet())); // ldexp(x, 0) -> x // ldexp(x, undef) -> x @@ -1006,11 +1055,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // TODO: Move to InstSimplify? if (match(Op0, PatternMatch::m_AnyZeroFP()) || match(Op1, PatternMatch::m_AnyZeroFP())) - return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType())); + return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType())); // If we can prove we don't have one of the special cases then we can use a // normal fmul instruction instead. 
- if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) { + if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) { auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II); FMul->takeName(&II); return IC.replaceInstUsesWith(II, FMul); @@ -1029,7 +1078,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { match(Op1, PatternMatch::m_AnyZeroFP())) { // It's tempting to just return Op2 here, but that would give the wrong // result if Op2 was -0.0. - auto *Zero = ConstantFP::getNullValue(II.getType()); + auto *Zero = ConstantFP::getZero(II.getType()); auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II); FAdd->takeName(&II); return IC.replaceInstUsesWith(II, FAdd); @@ -1037,7 +1086,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // If we can prove we don't have one of the special cases then we can use a // normal fma instead. - if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) { + if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) { II.setCalledOperand(Intrinsic::getDeclaration( II.getModule(), Intrinsic::fma, II.getType())); return &II; @@ -1053,26 +1102,62 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType())); break; } - default: { - if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = - AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) { - return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC); + case Intrinsic::amdgcn_buffer_store_format: + case Intrinsic::amdgcn_raw_buffer_store_format: + case Intrinsic::amdgcn_struct_buffer_store_format: + case Intrinsic::amdgcn_raw_tbuffer_store: + case Intrinsic::amdgcn_struct_tbuffer_store: + case Intrinsic::amdgcn_tbuffer_store: + case Intrinsic::amdgcn_image_store_1d: + case Intrinsic::amdgcn_image_store_1darray: + case Intrinsic::amdgcn_image_store_2d: + case Intrinsic::amdgcn_image_store_2darray: + case Intrinsic::amdgcn_image_store_2darraymsaa: + case 
Intrinsic::amdgcn_image_store_2dmsaa: + case Intrinsic::amdgcn_image_store_3d: + case Intrinsic::amdgcn_image_store_cube: + case Intrinsic::amdgcn_image_store_mip_1d: + case Intrinsic::amdgcn_image_store_mip_1darray: + case Intrinsic::amdgcn_image_store_mip_2d: + case Intrinsic::amdgcn_image_store_mip_2darray: + case Intrinsic::amdgcn_image_store_mip_3d: + case Intrinsic::amdgcn_image_store_mip_cube: { + if (!isa<FixedVectorType>(II.getArgOperand(0)->getType())) + break; + + APInt DemandedElts = + trimTrailingZerosInVector(IC, II.getArgOperand(0), &II); + + int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1; + if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx, + false)) { + return IC.eraseInstFromFunction(II); } + + break; + } } + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = + AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) { + return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC); } return std::nullopt; } /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics. /// +/// The result of simplifying amdgcn image and buffer store intrinsics is updating +/// definitions of the intrinsics vector argument, not Uses of the result like +/// image and buffer loads. /// Note: This only supports non-TFE/LWE image intrinsic calls; those have /// struct returns. static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, - int DMaskIdx = -1) { + int DMaskIdx, bool IsLoad) { - auto *IIVTy = cast<FixedVectorType>(II.getType()); + auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType() + : II.getOperand(0)->getType()); unsigned VWidth = IIVTy->getNumElements(); if (VWidth == 1) return nullptr; @@ -1088,7 +1173,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, // Buffer case. 
const unsigned ActiveBits = DemandedElts.getActiveBits(); - const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros(); + const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero(); // Start assuming the prefix of elements is demanded, but possibly clear // some other bits if there are trailing zeros (unused components at front) @@ -1101,6 +1186,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, unsigned OffsetIdx; switch (II.getIntrinsicID()) { case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_ptr_buffer_load: OffsetIdx = 1; break; case Intrinsic::amdgcn_s_buffer_load: @@ -1113,6 +1199,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, OffsetIdx = 1; break; case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_ptr_buffer_load: OffsetIdx = 2; break; default: @@ -1143,13 +1230,13 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1; unsigned NewDMaskVal = 0; - unsigned OrigLoadIdx = 0; + unsigned OrigLdStIdx = 0; for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) { const unsigned Bit = 1 << SrcIdx; if (!!(DMaskVal & Bit)) { - if (!!DemandedElts[OrigLoadIdx]) + if (!!DemandedElts[OrigLdStIdx]) NewDMaskVal |= Bit; - OrigLoadIdx++; + OrigLdStIdx++; } } @@ -1157,7 +1244,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal); } - unsigned NewNumElts = DemandedElts.countPopulation(); + unsigned NewNumElts = DemandedElts.popcount(); if (!NewNumElts) return UndefValue::get(IIVTy); @@ -1177,29 +1264,45 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, (NewNumElts == 1) ? 
EltTy : FixedVectorType::get(EltTy, NewNumElts); OverloadTys[0] = NewTy; + if (!IsLoad) { + SmallVector<int, 8> EltMask; + for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx) + if (DemandedElts[OrigStoreIdx]) + EltMask.push_back(OrigStoreIdx); + + if (NewNumElts == 1) + Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]); + else + Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask); + } + Function *NewIntrin = Intrinsic::getDeclaration( II.getModule(), II.getIntrinsicID(), OverloadTys); CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args); NewCall->takeName(&II); NewCall->copyMetadata(II); - if (NewNumElts == 1) { - return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall, - DemandedElts.countTrailingZeros()); - } + if (IsLoad) { + if (NewNumElts == 1) { + return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall, + DemandedElts.countr_zero()); + } - SmallVector<int, 8> EltMask; - unsigned NewLoadIdx = 0; - for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) { - if (!!DemandedElts[OrigLoadIdx]) - EltMask.push_back(NewLoadIdx++); - else - EltMask.push_back(NewNumElts); - } + SmallVector<int, 8> EltMask; + unsigned NewLoadIdx = 0; + for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) { + if (!!DemandedElts[OrigLoadIdx]) + EltMask.push_back(NewLoadIdx++); + else + EltMask.push_back(NewNumElts); + } + + auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask); - Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask); + return Shuffle; + } - return Shuffle; + return NewCall; } std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic( @@ -1211,12 +1314,18 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic( case Intrinsic::amdgcn_buffer_load: case Intrinsic::amdgcn_buffer_load_format: case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_ptr_buffer_load: case 
Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_raw_ptr_buffer_load_format: case Intrinsic::amdgcn_raw_tbuffer_load: + case Intrinsic::amdgcn_raw_ptr_tbuffer_load: case Intrinsic::amdgcn_s_buffer_load: case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_ptr_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: + case Intrinsic::amdgcn_struct_ptr_buffer_load_format: case Intrinsic::amdgcn_struct_tbuffer_load: + case Intrinsic::amdgcn_struct_ptr_tbuffer_load: case Intrinsic::amdgcn_tbuffer_load: return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts); default: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 15b7f971f09c..b69cae0c73b3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -18,10 +18,6 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> ]>; -def AMDGPULdExpOp : SDTypeProfile<1, 2, - [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] ->; - def AMDGPUFPClassOp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>] >; @@ -43,6 +39,7 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>] >; +def ImmOp : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def AMDGPUIfOp : SDTypeProfile<1, 2, @@ -85,9 +82,16 @@ def AMDGPUcall : SDNode<"AMDGPUISD::CALL", SDNPVariadic] >; -def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", - SDTypeProfile<0, 3, [SDTCisPtrTy<0>]>, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] +def AMDGPUTCReturnTP : SDTypeProfile<0, 3, [ + SDTCisPtrTy<0> +]>; + +def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", AMDGPUTCReturnTP, +[SDNPHasChain, SDNPOptInGlue, SDNPVariadic] +>; + +def AMDGPUtc_return_gfx: SDNode<"AMDGPUISD::TC_RETURN_GFX", AMDGPUTCReturnTP, +[SDNPHasChain, SDNPOptInGlue, 
SDNPVariadic] >; def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP", @@ -111,6 +115,12 @@ def AMDGPUfract_impl : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; // out = 1.0 / a def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; +// v_log_f32, which is log2 +def AMDGPUlog_impl : SDNode<"AMDGPUISD::LOG", SDTFPUnaryOp>; + +// v_exp_f32, which is exp2 +def AMDGPUexp_impl : SDNode<"AMDGPUISD::EXP", SDTFPUnaryOp>; + // out = 1.0 / sqrt(a) def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; @@ -121,8 +131,6 @@ def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) result clamped to +/- max_float. def AMDGPUrsq_clamp_impl : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; -def AMDGPUldexp_impl : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; - def AMDGPUpkrtz_f16_f32_impl : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; def AMDGPUpknorm_i16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>; def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>; @@ -351,11 +359,13 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai //===----------------------------------------------------------------------===// def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone, [SDNPHasChain, SDNPOptInGlue]>; +def AMDGPUendpgm_trap : SDNode<"AMDGPUISD::ENDPGM_TRAP", SDTNone, + [SDNPHasChain]>; def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; -def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, +def AMDGPUret_glue : SDNode<"AMDGPUISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] >; @@ -381,10 +391,15 @@ def AMDGPUcos : PatFrags<(ops node:$src), [(int_amdgcn_cos node:$src), (AMDGPUcos_impl node:$src)]>; def AMDGPUfract : PatFrags<(ops node:$src), [(int_amdgcn_fract node:$src), (AMDGPUfract_impl node:$src)]>; +def AMDGPUlog : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src), + 
(AMDGPUlog_impl node:$src)]>; +def AMDGPUlogf16 : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src), + (flog2 node:$src)]>; -def AMDGPUldexp : PatFrags<(ops node:$src0, node:$src1), - [(int_amdgcn_ldexp node:$src0, node:$src1), - (AMDGPUldexp_impl node:$src0, node:$src1)]>; +def AMDGPUexp : PatFrags<(ops node:$src), [(int_amdgcn_exp2 node:$src), + (AMDGPUexp_impl node:$src)]>; +def AMDGPUexpf16 : PatFrags<(ops node:$src), [(int_amdgcn_exp2 node:$src), + (fexp2 node:$src)]>; def AMDGPUfp_class : PatFrags<(ops node:$src0, node:$src1), [(int_amdgcn_class node:$src0, node:$src1), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 2639f1f45565..747f9fe2f8ae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -19,8 +19,8 @@ #include "AMDGPUTargetMachine.h" #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" -#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -63,7 +63,7 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector( const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB, - CodeGenCoverage &CoverageInfo, + CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { MRI = &MF.getRegInfo(); @@ -523,60 +523,6 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const { return true; } -bool AMDGPUInstructionSelector::selectG_FMA_FMAD(MachineInstr &I) const { - assert(I.getOpcode() == AMDGPU::G_FMA || I.getOpcode() == AMDGPU::G_FMAD); - - // Try to manually select MAD_MIX/FMA_MIX. 
- Register Dst = I.getOperand(0).getReg(); - LLT ResultTy = MRI->getType(Dst); - bool IsFMA = I.getOpcode() == AMDGPU::G_FMA; - if (ResultTy != LLT::scalar(32) || - (IsFMA ? !Subtarget->hasFmaMixInsts() : !Subtarget->hasMadMixInsts())) - return false; - - // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand - // using the conversion from f16. - bool MatchedSrc0, MatchedSrc1, MatchedSrc2; - auto [Src0, Src0Mods] = - selectVOP3PMadMixModsImpl(I.getOperand(1), MatchedSrc0); - auto [Src1, Src1Mods] = - selectVOP3PMadMixModsImpl(I.getOperand(2), MatchedSrc1); - auto [Src2, Src2Mods] = - selectVOP3PMadMixModsImpl(I.getOperand(3), MatchedSrc2); - -#ifndef NDEBUG - const SIMachineFunctionInfo *MFI = - I.getMF()->getInfo<SIMachineFunctionInfo>(); - AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); - assert((IsFMA || !Mode.allFP32Denormals()) && - "fmad selected with denormals enabled"); -#endif - - // TODO: We can select this with f32 denormals enabled if all the sources are - // converted from f16 (in which case fmad isn't legal). - if (!MatchedSrc0 && !MatchedSrc1 && !MatchedSrc2) - return false; - - const unsigned OpC = IsFMA ? 
AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32; - MachineInstr *MixInst = - BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpC), Dst) - .addImm(Src0Mods) - .addReg(copyToVGPRIfSrcFolded(Src0, Src0Mods, I.getOperand(1), &I)) - .addImm(Src1Mods) - .addReg(copyToVGPRIfSrcFolded(Src1, Src1Mods, I.getOperand(2), &I)) - .addImm(Src2Mods) - .addReg(copyToVGPRIfSrcFolded(Src2, Src2Mods, I.getOperand(3), &I)) - .addImm(0) - .addImm(0) - .addImm(0); - - if (!constrainSelectedInstRegOperands(*MixInst, TII, TRI, RBI)) - return false; - - I.eraseFromParent(); - return true; -} - bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const { MachineBasicBlock *BB = MI.getParent(); Register DstReg = MI.getOperand(0).getReg(); @@ -1100,6 +1046,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { return selectIntrinsicCmp(I); case Intrinsic::amdgcn_ballot: return selectBallot(I); + case Intrinsic::amdgcn_inverse_ballot: + return selectInverseBallot(I); case Intrinsic::amdgcn_reloc_constant: return selectRelocConstant(I); case Intrinsic::amdgcn_groupstaticsize: @@ -1343,27 +1291,26 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const { if (Opcode == -1) return false; - MachineInstr *SelectedMI; - if (CmpInst::isFPPredicate(Pred)) { - MachineOperand &LHS = I.getOperand(2); - MachineOperand &RHS = I.getOperand(3); - auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS); - auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS); - Register Src0Reg = - copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true); - Register Src1Reg = - copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true); - SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst) - .addImm(Src0Mods) - .addReg(Src0Reg) - .addImm(Src1Mods) - .addReg(Src1Reg) - .addImm(0); // clamp - } else { - SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst) - .add(I.getOperand(2)) - .add(I.getOperand(3)); - } + MachineInstrBuilder SelectedMI; + 
MachineOperand &LHS = I.getOperand(2); + MachineOperand &RHS = I.getOperand(3); + auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS); + auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS); + Register Src0Reg = + copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true); + Register Src1Reg = + copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true); + SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst); + if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers)) + SelectedMI.addImm(Src0Mods); + SelectedMI.addReg(Src0Reg); + if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers)) + SelectedMI.addImm(Src1Mods); + SelectedMI.addReg(Src1Reg); + if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp)) + SelectedMI.addImm(0); // clamp + if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) + SelectedMI.addImm(0); // op_sel RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI); if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI)) @@ -1379,28 +1326,56 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { Register DstReg = I.getOperand(0).getReg(); const unsigned Size = MRI->getType(DstReg).getSizeInBits(); const bool Is64 = Size == 64; + const bool IsWave32 = (STI.getWavefrontSize() == 32); - if (Size != STI.getWavefrontSize()) + // In the common case, the return type matches the wave size. + // However we also support emitting i64 ballots in wave32 mode. + if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32)) return false; std::optional<ValueAndVReg> Arg = getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI); + const auto BuildCopy = [&](Register SrcReg) { + if (Size == STI.getWavefrontSize()) { + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg) + .addReg(SrcReg); + return; + } + + // If emitting a i64 ballot in wave32, fill the upper bits with zeroes. 
+ Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(SrcReg) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); + }; + if (Arg) { const int64_t Value = Arg->Value.getSExtValue(); if (Value == 0) { unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); - } else if (Value == -1) { // all ones - Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO; - BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg); - } else + } else if (Value == -1) // all ones + BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC); + else return false; - } else { - Register SrcReg = I.getOperand(2).getReg(); - BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg); - } + } else + BuildCopy(I.getOperand(2).getReg()); + + I.eraseFromParent(); + return true; +} + +bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + const Register DstReg = I.getOperand(0).getReg(); + const Register MaskReg = I.getOperand(2).getReg(); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(MaskReg); I.eraseFromParent(); return true; } @@ -1635,7 +1610,7 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI, .addImm(0); } else { std::tie(BaseOffset, ImmOffset) = - AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KnownBits); + AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB); if (Readfirstlane) { // We have the constant offset now, so put the readfirstlane back on the @@ -1824,7 +1799,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( } // Set G16 opcode - if (IsG16 && !IsA16) { + if (Subtarget->hasG16() && IsG16) { const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = 
AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); assert(G16MappingInfo); @@ -1859,7 +1834,10 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( // The legalizer preprocessed the intrinsic arguments. If we aren't using // NSA, these should have been packed into a single value in the first // address register - const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs; + const bool UseNSA = + NumVAddrRegs != 1 && + (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs + : NumVAddrDwords == NumVAddrRegs); if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) { LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n"); return false; @@ -1898,7 +1876,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, NumVDataDwords, NumVAddrDwords); } - assert(Opcode != -1); + if (Opcode == -1) + return false; auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode)) .cloneMemRefs(MI); @@ -2050,7 +2029,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_s_barrier: return selectSBarrier(I); case Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: case Intrinsic::amdgcn_struct_buffer_load_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: return selectBufferLoadLds(I); case Intrinsic::amdgcn_global_load_lds: return selectGlobalLoadLds(I); @@ -2137,7 +2118,7 @@ static int sizeToSubRegIndex(unsigned Size) { return AMDGPU::sub0; if (Size > 256) return -1; - return sizeToSubRegIndex(PowerOf2Ceil(Size)); + return sizeToSubRegIndex(llvm::bit_ceil(Size)); } } @@ -2801,7 +2782,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const { // Try to avoid emitting a bit operation when we only need to touch half of // the 64-bit pointer. 
- APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zext(64); + APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64); const APInt MaskHi32 = APInt::getHighBitsSet(64, 32); const APInt MaskLo32 = APInt::getLowBitsSet(64, 32); @@ -2953,7 +2934,7 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT( unsigned SubReg; std::tie(IdxReg, SubReg) = computeIndirectRegIndex( - *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KnownBits); + *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB); if (SrcRB->getID() == AMDGPU::SGPRRegBankID) { if (DstTy.getSizeInBits() != 32 && !Is64) @@ -3033,8 +3014,8 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( return false; unsigned SubReg; - std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, - ValSize / 8, *KnownBits); + std::tie(IdxReg, SubReg) = + computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB); const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && STI.useVGPRIndexMode(); @@ -3402,11 +3383,6 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { return selectG_FABS(I); case TargetOpcode::G_EXTRACT: return selectG_EXTRACT(I); - case TargetOpcode::G_FMA: - case TargetOpcode::G_FMAD: - if (selectG_FMA_FMAD(I)) - return true; - return selectImpl(I, *CoverageInfo); case TargetOpcode::G_MERGE_VALUES: case TargetOpcode::G_CONCAT_VECTORS: return selectG_MERGE_VALUES(I); @@ -3446,9 +3422,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_ATOMICRMW_MAX: case TargetOpcode::G_ATOMICRMW_UMIN: case TargetOpcode::G_ATOMICRMW_UMAX: + case TargetOpcode::G_ATOMICRMW_UINC_WRAP: + case TargetOpcode::G_ATOMICRMW_UDEC_WRAP: case TargetOpcode::G_ATOMICRMW_FADD: - case AMDGPU::G_AMDGPU_ATOMIC_INC: - case AMDGPU::G_AMDGPU_ATOMIC_DEC: case AMDGPU::G_AMDGPU_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_ATOMIC_FMAX: return selectG_LOAD_STORE_ATOMICRMW(I); @@ -3460,7 +3436,11 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case 
TargetOpcode::G_ZEXT: case TargetOpcode::G_ANYEXT: case TargetOpcode::G_SEXT_INREG: - if (selectImpl(I, *CoverageInfo)) + // This is a workaround. For extension from type i1, `selectImpl()` uses + // patterns from TD file and generates an illegal VGPR to SGPR COPY as type + // i1 can only be hold in a SGPR class. + if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) && + selectImpl(I, *CoverageInfo)) return true; return selectG_SZA_EXT(I); case TargetOpcode::G_BRCOND: @@ -3506,8 +3486,10 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { } -std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl( - MachineOperand &Root, bool AllowAbs, bool OpSel) const { +std::pair<Register, unsigned> +AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root, + bool IsCanonicalizing, + bool AllowAbs, bool OpSel) const { Register Src = Root.getReg(); unsigned Mods = 0; MachineInstr *MI = getDefIgnoringCopies(Src, *MRI); @@ -3516,6 +3498,15 @@ std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl( Src = MI->getOperand(1).getReg(); Mods |= SISrcMods::NEG; MI = getDefIgnoringCopies(Src, *MRI); + } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) { + // Fold fsub [+-]0 into fneg. This may not have folded depending on the + // denormal mode, but we're implicitly canonicalizing in a source operand. 
+ const ConstantFP *LHS = + getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI); + if (LHS && LHS->isZero()) { + Mods |= SISrcMods::NEG; + Src = MI->getOperand(2).getReg(); + } } if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) { @@ -3578,7 +3569,9 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root, + /*IsCanonicalizing=*/true, + /*AllowAbs=*/false); return {{ [=](MachineInstrBuilder &MIB) { @@ -3614,10 +3607,26 @@ AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing( + MachineOperand &Root) const { + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false); + + return {{ + [=](MachineInstrBuilder &MIB) { + MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB)); + }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const { Register Src; unsigned Mods; - std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false); + std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true, + /*AllowAbs=*/false); return {{ [=](MachineInstrBuilder &MIB) { @@ -3653,6 +3662,8 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl( MI = MRI.getVRegDef(Src); } + // TODO: Handle G_FSUB 0 as fneg + // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. 
(void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard() @@ -3739,8 +3750,9 @@ AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const { Register Src; unsigned Mods; std::tie(Src, Mods) = selectVOP3ModsImpl(Root, - /* AllowAbs */ false, - /* OpSel */ false); + /*IsCanonicalizing=*/true, + /*AllowAbs=*/false, + /*OpSel=*/false); return {{ [=](MachineInstrBuilder &MIB) { @@ -3756,8 +3768,9 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const { Register Src; unsigned Mods; std::tie(Src, Mods) = selectVOP3ModsImpl(Root, - /* AllowAbs */ false, - /* OpSel */ true); + /*IsCanonicalizing=*/true, + /*AllowAbs=*/false, + /*OpSel=*/true); return {{ [=](MachineInstrBuilder &MIB) { @@ -3903,7 +3916,7 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root, int64_t ConstOffset; std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Root.getReg(), *MRI); - if (ConstOffset == 0) + if (ConstOffset == 0 || !isFlatScratchBaseLegal(PtrBase, FlatVariant)) return Default; unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace(); @@ -4066,7 +4079,7 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { // possible. std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI); - if (ConstOffset != 0 && + if (ConstOffset != 0 && isFlatScratchBaseLegal(PtrBase) && TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch)) { Addr = PtrBase; @@ -4122,9 +4135,9 @@ bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug( // The bug affects the swizzling of SVS accesses if there is any carry out // from the two low order bits (i.e. from bit 1 into bit 2) when adding // voffset to (soffset + inst_offset). 
- auto VKnown = KnownBits->getKnownBits(VAddr); + auto VKnown = KB->getKnownBits(VAddr); auto SKnown = KnownBits::computeForAddSub( - true, false, KnownBits->getKnownBits(SAddr), + true, false, KB->getKnownBits(SAddr), KnownBits::makeConstant(APInt(32, ImmOffset))); uint64_t VMax = VKnown.getMaxValue().getZExtValue(); uint64_t SMax = SKnown.getMaxValue().getZExtValue(); @@ -4159,6 +4172,9 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const { Register LHS = AddrDef->MI->getOperand(1).getReg(); auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI); + if (!isFlatScratchBaseLegal(LHS) || !isFlatScratchBaseLegal(RHS)) + return std::nullopt; + if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset)) return std::nullopt; @@ -4195,9 +4211,10 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { // TODO: Should this be inside the render function? The iterator seems to // move. + const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(); BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), HighBits) - .addImm(Offset & ~4095); + .addImm(Offset & ~MaxOffset); return {{[=](MachineInstrBuilder &MIB) { // rsrc MIB.addReg(Info->getScratchRSrcReg()); @@ -4211,7 +4228,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { MIB.addImm(0); }, [=](MachineInstrBuilder &MIB) { // offset - MIB.addImm(Offset & 4095); + MIB.addImm(Offset & MaxOffset); }}}; } @@ -4228,7 +4245,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { if (ConstOffset != 0) { if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) && (!STI.privateMemoryResourceIsRangeChecked() || - KnownBits->signBitIsZero(PtrBase))) { + KB->signBitIsZero(PtrBase))) { const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase); if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX) FI = PtrBaseDef->getOperand(1).getIndex(); @@ -4270,7 +4287,7 @@ bool AMDGPUInstructionSelector::isDSOffsetLegal(Register 
Base, // On Southern Islands instruction with a negative base value and an offset // don't seem to work. - return KnownBits->signBitIsZero(Base); + return KB->signBitIsZero(Base); } bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0, @@ -4286,7 +4303,17 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0, // On Southern Islands instruction with a negative base value and an offset // don't seem to work. - return KnownBits->signBitIsZero(Base); + return KB->signBitIsZero(Base); +} + +bool AMDGPUInstructionSelector::isFlatScratchBaseLegal( + Register Base, uint64_t FlatVariant) const { + if (FlatVariant != SIInstrFlags::FlatScratch) + return true; + + // When value in 32-bit Base can be negative calculate scratch offset using + // 32-bit add instruction, otherwise use Base(unsigned) + offset. + return KB->signBitIsZero(Base); } bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, @@ -4298,12 +4325,11 @@ bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI, if (!RHS) return false; - if (RHS->countTrailingOnes() >= ShAmtBits) + if (RHS->countr_one() >= ShAmtBits) return true; - const APInt &LHSKnownZeros = - KnownBits->getKnownZeroes(MI.getOperand(1).getReg()); - return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits; + const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg()); + return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits; } // Return the wave level SGPR base address if this is a wave address. 
@@ -4746,64 +4772,6 @@ AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const { }}; } -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const { - Register VAddr; - Register RSrcReg; - Register SOffset; - int64_t Offset = 0; - - if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset)) - return {}; - - // FIXME: Use defaulted operands for trailing 0s and remove from the complex - // pattern. - return {{ - [=](MachineInstrBuilder &MIB) { // rsrc - MIB.addReg(RSrcReg); - }, - [=](MachineInstrBuilder &MIB) { // vaddr - MIB.addReg(VAddr); - }, - [=](MachineInstrBuilder &MIB) { // soffset - if (SOffset) - MIB.addReg(SOffset); - else - MIB.addImm(0); - }, - [=](MachineInstrBuilder &MIB) { // offset - MIB.addImm(Offset); - }, - [=](MachineInstrBuilder &MIB) { - MIB.addImm(AMDGPU::CPol::GLC); // cpol - } - }}; -} - -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const { - Register RSrcReg; - Register SOffset; - int64_t Offset = 0; - - if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset)) - return {}; - - return {{ - [=](MachineInstrBuilder &MIB) { // rsrc - MIB.addReg(RSrcReg); - }, - [=](MachineInstrBuilder &MIB) { // soffset - if (SOffset) - MIB.addReg(SOffset); - else - MIB.addImm(0); - }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset - [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol - }}; -} - /// Get an immediate that must be 32-bits, and treated as zero extended. 
static std::optional<uint64_t> getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) { @@ -4851,7 +4819,7 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const { Register SOffset; unsigned Offset; std::tie(SOffset, Offset) = - AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg(), KnownBits); + AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg(), KB); if (!SOffset) return std::nullopt; @@ -4984,6 +4952,22 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root, } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PMadMixModsExt( + MachineOperand &Root) const { + Register Src; + unsigned Mods; + bool Matched; + std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched); + if (!Matched) + return {}; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const { Register Src; unsigned Mods; @@ -5031,7 +5015,7 @@ void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, int OpIdx) const { assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && "Expected G_CONSTANT"); - MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation()); + MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount()); } /// This only really exists to satisfy DAG type checking machinery, so is a @@ -5042,6 +5026,13 @@ void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB, MIB.addImm(MI.getOperand(OpIdx).getImm()); } +void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB, + const MachineInstr &MI, + int OpIdx) const { + assert(OpIdx >= 0 && "expected to match an immediate operand"); + MIB.addImm(MI.getOperand(OpIdx).getImm() ? 
(int64_t)SISrcMods::OP_SEL_0 : 0); +} + void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 33a01ed0a1ce..243ff72e2979 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H +#include "SIDefines.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" #include "llvm/IR/InstrTypes.h" @@ -58,7 +59,7 @@ public: static const char *getName(); void setupMF(MachineFunction &MF, GISelKnownBits *KB, - CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI, + CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) override; private: @@ -111,6 +112,7 @@ private: bool selectDivScale(MachineInstr &MI) const; bool selectIntrinsicCmp(MachineInstr &MI) const; bool selectBallot(MachineInstr &I) const; + bool selectInverseBallot(MachineInstr &I) const; bool selectRelocConstant(MachineInstr &I) const; bool selectGroupStaticSize(MachineInstr &I) const; bool selectReturnAddress(MachineInstr &I) const; @@ -146,9 +148,10 @@ private: bool selectSMFMACIntrin(MachineInstr &I) const; bool selectWaveAddress(MachineInstr &I) const; - std::pair<Register, unsigned> - selectVOP3ModsImpl(MachineOperand &Root, bool AllowAbs = true, - bool OpSel = false) const; + std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root, + bool IsCanonicalizing = true, + bool AllowAbs = true, + bool OpSel = false) const; Register copyToVGPRIfSrcFolded(Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt, @@ -169,6 +172,8 @@ private: InstructionSelector::ComplexRendererFns selectVOP3Mods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns + 
selectVOP3ModsNonCanonicalizing(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3BMods(MachineOperand &Root) const; ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const; @@ -236,6 +241,8 @@ private: bool isDSOffsetLegal(Register Base, int64_t Offset) const; bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1, unsigned Size) const; + bool isFlatScratchBaseLegal( + Register Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const; std::pair<Register, unsigned> selectDS1Addr1OffsetImpl(MachineOperand &Root) const; @@ -285,18 +292,13 @@ private: InstructionSelector::ComplexRendererFns selectMUBUFOffset(MachineOperand &Root) const; - InstructionSelector::ComplexRendererFns - selectMUBUFOffsetAtomic(MachineOperand &Root) const; - - InstructionSelector::ComplexRendererFns - selectMUBUFAddr64Atomic(MachineOperand &Root) const; - ComplexRendererFns selectSMRDBufferImm(MachineOperand &Root) const; ComplexRendererFns selectSMRDBufferImm32(MachineOperand &Root) const; ComplexRendererFns selectSMRDBufferSgprImm(MachineOperand &Root) const; std::pair<Register, unsigned> selectVOP3PMadMixModsImpl(MachineOperand &Root, bool &Matched) const; + ComplexRendererFns selectVOP3PMadMixModsExt(MachineOperand &Root) const; ComplexRendererFns selectVOP3PMadMixMods(MachineOperand &Root) const; void renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI, @@ -305,6 +307,9 @@ private: void renderTruncTImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; + void renderOpSelTImm(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx) const; + void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 22b327279211..2305097e3f94 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -104,15 +104,18 @@ 
class PredicateControl { } class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>, - PredicateControl; + PredicateControl, GISelFlags; + +let GIIgnoreCopies = 1 in +class AMDGPUPatIgnoreCopies<dag pattern, dag result> : AMDGPUPat<pattern, result>; let RecomputePerFunction = 1 in { -def FP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">; -def FP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()">; -def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">; -def NoFP16Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">; -def NoFP32Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()">; -def NoFP64Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">; +def FP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals != DenormalMode::getPreserveSign()">; +def FP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals != DenormalMode::getPreserveSign()">; +def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals != DenormalMode::getPreserveSign()">; +def NoFP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">; +def NoFP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals == DenormalMode::getPreserveSign()">; +def NoFP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">; def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; } @@ -120,37 +123,45 @@ def FMA : Predicate<"Subtarget->hasFMA()">; def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; -def u16ImmTarget : AsmOperandClass { - let Name = "U16Imm"; - let 
RenderMethod = "addImmOperands"; -} +def i1imm_0 : OperandWithDefaultOps<i1, (ops (i1 0))>; -def s16ImmTarget : AsmOperandClass { - let Name = "S16Imm"; +class CustomOperandClass<string name, bit optional, string parserMethod, + string defaultMethod> + : AsmOperandClass { + let Name = name; + let PredicateMethod = "is"#name; + let ParserMethod = parserMethod; let RenderMethod = "addImmOperands"; + let IsOptional = optional; + let DefaultMethod = defaultMethod; } -let OperandType = "OPERAND_IMMEDIATE" in { - -def u32imm : Operand<i32> { - let PrintMethod = "printU32ImmOperand"; -} - -def u16imm : Operand<i16> { - let PrintMethod = "printU16ImmOperand"; - let ParserMatchClass = u16ImmTarget; +class CustomOperandProps<bit optional = 0, string name = NAME> { + string ImmTy = "ImmTy"#name; + string ParserMethod = "parse"#name; + string DefaultValue = "0"; + string DefaultMethod = "[this]() { return "# + "AMDGPUOperand::CreateImm(this, "#DefaultValue#", SMLoc(), "# + "AMDGPUOperand::"#ImmTy#"); }"; + string PrintMethod = "print"#name; + AsmOperandClass ParserMatchClass = + CustomOperandClass<name, optional, ParserMethod, DefaultMethod>; + string OperandType = "OPERAND_IMMEDIATE"; } -def s16imm : Operand<i16> { - let PrintMethod = "printU16ImmOperand"; - let ParserMatchClass = s16ImmTarget; -} +class CustomOperand<ValueType type, bit optional = 0, string name = NAME> + : Operand<type>, CustomOperandProps<optional, name>; -def u8imm : Operand<i8> { - let PrintMethod = "printU8ImmOperand"; +class ImmOperand<ValueType type, string name = NAME, bit optional = 0, + string printer = "print"#name> + : CustomOperand<type, optional, name> { + let ImmTy = "ImmTyNone"; + let ParserMethod = ""; + let PrintMethod = printer; } -} // End OperandType = "OPERAND_IMMEDIATE" +def s16imm : ImmOperand<i16, "S16Imm", 0, "printU16ImmOperand">; +def u16imm : ImmOperand<i16, "U16Imm", 0, "printU16ImmOperand">; //===--------------------------------------------------------------------===// // 
Custom Operands @@ -210,6 +221,12 @@ class is_canonicalized<SDPatternOperator op> : PatFrag< }]; } +class FoldTernaryOpPat<SDPatternOperator op1, SDPatternOperator op2> : PatFrag< + (ops node:$src0, node:$src1, node:$src2), + (op2 (op1 node:$src0, node:$src1), node:$src2) +>; + +def imad : FoldTernaryOpPat<mul, add>; let Properties = [SDNPCommutative, SDNPAssociative] in { def smax_oneuse : HasOneUseBinOp<smax>; @@ -638,6 +655,8 @@ defm atomic_load_umax : binary_atomic_op_all_as<atomic_load_umax>; defm atomic_load_umin : binary_atomic_op_all_as<atomic_load_umin>; defm atomic_load_xor : binary_atomic_op_all_as<atomic_load_xor>; defm atomic_load_fadd : binary_atomic_op_all_as<atomic_load_fadd, 0>; +defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>; +defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>; let MemoryVT = v2f16 in defm atomic_load_fadd_v2f16 : binary_atomic_op_all_as<atomic_load_fadd, 0>; defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 9e86bd0c2b97..fb7148ba10ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -14,7 +14,7 @@ #include "AMDGPU.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" +#include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" @@ -46,7 +46,7 @@ class AMDGPULateCodeGenPrepare const DataLayout *DL = nullptr; AssumptionCache *AC = nullptr; - LegacyDivergenceAnalysis *DA = nullptr; + UniformityInfo *UA = nullptr; public: static char ID; @@ -59,7 +59,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<LegacyDivergenceAnalysis>(); + 
AU.addRequired<UniformityInfoWrapperPass>(); AU.setPreservesAll(); } @@ -91,7 +91,7 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) { return false; AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - DA = &getAnalysis<LegacyDivergenceAnalysis>(); + UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); bool Changed = false; for (auto &BB : F) @@ -122,7 +122,7 @@ bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const { if (LI.getAlign() < DL->getABITypeAlign(Ty)) return false; // It should be uniform, i.e. a scalar load. - return DA->isUniform(&LI); + return UA->isUniform(&LI); } bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) { @@ -156,18 +156,14 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) { IRBuilder<> IRB(&LI); IRB.SetCurrentDebugLocation(LI.getDebugLoc()); - unsigned AS = LI.getPointerAddressSpace(); - unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8; + unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType()); auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits); - PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS); - PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS); - auto *NewPtr = IRB.CreateBitCast( - IRB.CreateConstGEP1_64( - IRB.getInt8Ty(), - IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy), - Offset - Adjust), - Int32PtrTy); + auto *NewPtr = IRB.CreateConstGEP1_64( + IRB.getInt8Ty(), + IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()), + Offset - Adjust); + LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4)); NewLd->copyMetadata(LI); NewLd->setMetadata(LLVMContext::MD_range, nullptr); @@ -184,7 +180,7 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) { INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR late optimizations", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) 
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR late optimizations", false, false) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 41cb0a99b420..120c00b14a36 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" @@ -131,6 +132,42 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { }; } +// Increase the number of vector elements to reach the next legal RegClass. +static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + const unsigned NumElts = Ty.getNumElements(); + const unsigned EltSize = Ty.getElementType().getSizeInBits(); + const unsigned MaxNumElts = MaxRegisterSize / EltSize; + + assert(EltSize == 32 || EltSize == 64); + assert(Ty.getSizeInBits() < MaxRegisterSize); + + unsigned NewNumElts; + // Find the nearest legal RegClass that is larger than the current type. 
+ for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) { + if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize)) + break; + } + + return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize)); + }; +} + +static LLT getBufferRsrcScalarType(const LLT Ty) { + if (!Ty.isVector()) + return LLT::scalar(128); + const ElementCount NumElems = Ty.getElementCount(); + return LLT::vector(NumElems, LLT::scalar(128)); +} + +static LLT getBufferRsrcRegisterType(const LLT Ty) { + if (!Ty.isVector()) + return LLT::fixed_vector(4, LLT::scalar(32)); + const unsigned NumElems = Ty.getElementCount().getFixedValue(); + return LLT::fixed_vector(NumElems * 4, LLT::scalar(32)); +} + static LLT getBitcastRegisterType(const LLT Ty) { const unsigned Size = Ty.getSizeInBits(); @@ -215,6 +252,15 @@ static LegalityPredicate isRegisterType(unsigned TypeIdx) { }; } +// RegisterType that doesn't have a corresponding RegClass. +static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + LLT Ty = Query.Types[TypeIdx]; + return isRegisterType(Ty) && + !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits()); + }; +} + static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT QueryTy = Query.Types[TypeIdx]; @@ -239,7 +285,7 @@ static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { // handle some operations by just promoting the register during // selection. There are also d16 loads on GFX9+ which preserve the high bits. static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, - bool IsLoad) { + bool IsLoad, bool IsAtomic) { switch (AS) { case AMDGPUAS::PRIVATE_ADDRESS: // FIXME: Private element size. 
@@ -249,6 +295,7 @@ static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::CONSTANT_ADDRESS: case AMDGPUAS::CONSTANT_ADDRESS_32BIT: + case AMDGPUAS::BUFFER_RESOURCE: // Treat constant and global as identical. SMRD loads are sometimes usable for // global loads (ideally constant address space should be eliminated) // depending on the context. Legality cannot be context dependent, but @@ -257,9 +304,10 @@ static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, // kernel. return IsLoad ? 512 : 128; default: - // Flat addresses may contextually need to be split to 32-bit parts if they - // may alias scratch depending on the subtarget. - return 128; + // FIXME: Flat addresses may contextually need to be split to 32-bit parts + // if they may alias scratch depending on the subtarget. This needs to be + // moved to custom handling to use addressMayBeAccessedAsPrivate + return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32; } } @@ -295,7 +343,9 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, if (MemSize != RegSize && RegSize != 32) return false; - if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) + if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, + Query.MMODescrs[0].Ordering != + AtomicOrdering::NotAtomic)) return false; switch (MemSize) { @@ -329,6 +379,21 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, return true; } +// The newer buffer intrinsic forms take their resource arguments as +// pointers in address space 8, aka s128 values. However, in order to not break +// SelectionDAG, the underlying operations have to continue to take v4i32 +// arguments. Therefore, we convert resource pointers - or vectors of them +// to integer values here. 
+static bool hasBufferRsrcWorkaround(const LLT Ty) { + if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE) + return true; + if (Ty.isVector()) { + const LLT ElemTy = Ty.getElementType(); + return hasBufferRsrcWorkaround(ElemTy); + } + return false; +} + // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so // workaround this. Eventually it should ignore the type for loads and only care // about the size. Return true in cases where we will workaround this for now by @@ -340,6 +405,9 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) { const unsigned Size = Ty.getSizeInBits(); if (Size <= 64) return false; + // Address space 8 pointers get their own workaround. + if (hasBufferRsrcWorkaround(Ty)) + return false; if (!Ty.isVector()) return true; @@ -354,7 +422,7 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) { static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { const LLT Ty = Query.Types[0]; return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && - !loadStoreBitcastWorkaround(Ty); + !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty); } /// Return true if a load or store of the type should be lowered with a bitcast @@ -392,7 +460,7 @@ static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) return false; - if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) + if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false)) return false; // A load is known dereferenceable up to the alignment, so it's legal to widen @@ -422,6 +490,80 @@ static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, Query.Types[1].getAddressSpace(), Opcode); } +/// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial +/// type of the operand `idx` and then to transform it to a `p8` via bitcasts +/// and inttoptr. In addition, handle vectors of p8. Returns the new type. 
+static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, + MachineRegisterInfo &MRI, unsigned Idx) { + MachineOperand &MO = MI.getOperand(Idx); + + const LLT PointerTy = MRI.getType(MO.getReg()); + + // Paranoidly prevent us from doing this multiple times. + if (!hasBufferRsrcWorkaround(PointerTy)) + return PointerTy; + + const LLT ScalarTy = getBufferRsrcScalarType(PointerTy); + const LLT VectorTy = getBufferRsrcRegisterType(PointerTy); + if (!PointerTy.isVector()) { + // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8) + const unsigned NumParts = PointerTy.getSizeInBits() / 32; + const LLT S32 = LLT::scalar(32); + + Register VectorReg = MRI.createGenericVirtualRegister(VectorTy); + std::array<Register, 4> VectorElems; + B.setInsertPt(B.getMBB(), ++B.getInsertPt()); + for (unsigned I = 0; I < NumParts; ++I) + VectorElems[I] = + B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0); + B.buildMergeValues(MO, VectorElems); + MO.setReg(VectorReg); + return VectorTy; + } + Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy); + B.setInsertPt(B.getMBB(), ++B.getInsertPt()); + auto Scalar = B.buildBitcast(ScalarTy, BitcastReg); + B.buildIntToPtr(MO, Scalar); + MO.setReg(BitcastReg); + + return VectorTy; +} + +/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is +/// the form in which the value must be in order to be passed to the low-level +/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is +/// needed in order to account for the fact that we can't define a register +/// class for s128 without breaking SelectionDAG. 
+static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) { + MachineRegisterInfo &MRI = *B.getMRI(); + const LLT PointerTy = MRI.getType(Pointer); + const LLT ScalarTy = getBufferRsrcScalarType(PointerTy); + const LLT VectorTy = getBufferRsrcRegisterType(PointerTy); + + if (!PointerTy.isVector()) { + // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32) + SmallVector<Register, 4> PointerParts; + const unsigned NumParts = PointerTy.getSizeInBits() / 32; + auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer); + for (unsigned I = 0; I < NumParts; ++I) + PointerParts.push_back(Unmerged.getReg(I)); + return B.buildBuildVector(VectorTy, PointerParts).getReg(0); + } + Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0); + return B.buildBitcast(VectorTy, Scalar).getReg(0); +} + +static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, + unsigned Idx) { + MachineOperand &MO = MI.getOperand(Idx); + + const LLT PointerTy = B.getMRI()->getType(MO.getReg()); + // Paranoidly prevent us from doing this multiple times. 
+ if (!hasBufferRsrcWorkaround(PointerTy)) + return; + MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B)); +} + AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const GCNTargetMachine &TM) : ST(ST_) { @@ -484,6 +626,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); + const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER); + const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE); const LLT CodePtr = FlatPtr; @@ -495,6 +639,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr }; + const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr}; + const std::initializer_list<LLT> FPTypesBase = { S32, S64 }; @@ -515,17 +661,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more // elements for v3s16 getActionDefinitionsBuilder(G_PHI) - .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) - .legalFor(AllS32Vectors) - .legalFor(AllS64Vectors) - .legalFor(AddrSpaces64) - .legalFor(AddrSpaces32) - .legalIf(isPointer(0)) - .clampScalar(0, S16, S256) - .widenScalarToNextPow2(0, 32) - .clampMaxNumElements(0, S32, 16) - .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .scalarize(0); + .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) + .legalFor(AllS32Vectors) + .legalFor(AllS64Vectors) + .legalFor(AddrSpaces64) + .legalFor(AddrSpaces32) + .legalFor(AddrSpaces128) + .legalIf(isPointer(0)) + .clampScalar(0, S16, S256) + .widenScalarToNextPow2(0, 32) + .clampMaxNumElements(0, S32, 16) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .scalarize(0); if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { // Full set of gfx9 features. 
@@ -760,13 +907,31 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampScalar(0, S16, S64); if (ST.has16BitInsts()) { - getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) + getActionDefinitionsBuilder(G_FSQRT) + .legalFor({S32, S16}) + .customFor({S64}) + .scalarize(0) + .clampScalar(0, S16, S64); + getActionDefinitionsBuilder(G_FFLOOR) .legalFor({S32, S64, S16}) .scalarize(0) .clampScalar(0, S16, S64); + + getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) + .legalFor({{S32, S32}, {S64, S32}, {S16, S16}}) + .scalarize(0) + .maxScalarIf(typeIs(0, S16), 1, S16) + .clampScalar(1, S32, S32) + .lower(); + + getActionDefinitionsBuilder(G_FFREXP) + .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}}) + .scalarize(0) + .lower(); } else { getActionDefinitionsBuilder(G_FSQRT) - .legalFor({S32, S64}) + .legalFor({S32}) + .customFor({S64}) .scalarize(0) .clampScalar(0, S32, S64); @@ -782,6 +947,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .clampScalar(0, S32, S64); } + + getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) + .legalFor({{S32, S32}, {S64, S32}}) + .scalarize(0) + .clampScalar(0, S32, S64) + .clampScalar(1, S32, S32) + .lower(); + + getActionDefinitionsBuilder(G_FFREXP) + .customFor({{S32, S32}, {S64, S32}}) + .scalarize(0) + .minScalar(0, S32) + .clampScalar(1, S32, S32) + .lower(); } getActionDefinitionsBuilder(G_FPTRUNC) @@ -906,9 +1085,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, } getActionDefinitionsBuilder(G_PTR_ADD) - .legalIf(all(isPointer(0), sameSize(0, 1))) - .scalarize(0) - .scalarSameSizeAs(1, 0); + .unsupportedFor({BufferFatPtr, RsrcPtr}) + .legalIf(all(isPointer(0), sameSize(0, 1))) + .scalarize(0) + .scalarSameSizeAs(1, 0); getActionDefinitionsBuilder(G_PTRMASK) .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) @@ -948,15 +1128,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); // FIXME: fpow has a 
selection pattern that should move to custom lowering. - auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); - if (ST.has16BitInsts()) - Exp2Ops.legalFor({S32, S16}); - else - Exp2Ops.legalFor({S32}); - Exp2Ops.clampScalar(0, MinScalarFPTy, S32); - Exp2Ops.scalarize(0); - - auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); + auto &ExpOps = getActionDefinitionsBuilder(G_FPOW); if (ST.has16BitInsts()) ExpOps.customFor({{S32}, {S16}}); else @@ -968,6 +1140,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampScalar(0, MinScalarFPTy, S32) .lower(); + auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2}); + Log2Ops.customFor({S32}); + if (ST.has16BitInsts()) + Log2Ops.legalFor({S16}); + else + Log2Ops.customFor({S16}); + Log2Ops.scalarize(0) + .lower(); + + auto &LogOps = getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP}); + LogOps.customFor({S32, S16}); + LogOps.clampScalar(0, MinScalarFPTy, S32) + .scalarize(0); + // The 64-bit versions produce 32-bit results, but only on the SALU. getActionDefinitionsBuilder(G_CTPOP) .legalFor({{S32, S32}, {S32, S64}}) @@ -1115,7 +1301,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT PtrTy = Query.Types[1]; unsigned AS = PtrTy.getAddressSpace(); - if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) + if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, + Query.MMODescrs[0].Ordering != + AtomicOrdering::NotAtomic)) return true; // Catch weird sized loads that don't evenly divide into the access sizes @@ -1178,6 +1366,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return isLoadStoreLegal(ST, Query); }); + // The custom pointers (fat pointers, buffer resources) don't work with load + // and store at this level. Fat pointers should have been lowered to + // intrinsics before the translation to MIR. 
+ Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr})); + + // Address space 8 pointers are handled by a 4xs32 load, bitcast, and + // ptrtoint. This is needed to account for the fact that we can't have i128 + // as a register class for SelectionDAG reasons. + Actions.customIf([=](const LegalityQuery &Query) -> bool { + return hasBufferRsrcWorkaround(Query.Types[0]); + }); + // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to // 64-bits. // @@ -1223,9 +1423,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (DstSize > MemSize) return std::pair(0, LLT::scalar(MemSize)); - unsigned MaxSize = maxSizeForAddrSpace(ST, - PtrTy.getAddressSpace(), - Op == G_LOAD); + unsigned MaxSize = maxSizeForAddrSpace( + ST, PtrTy.getAddressSpace(), Op == G_LOAD, + Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); if (MemSize > MaxSize) return std::pair(0, LLT::scalar(MaxSize)); @@ -1242,9 +1442,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT PtrTy = Query.Types[1]; LLT EltTy = DstTy.getElementType(); - unsigned MaxSize = maxSizeForAddrSpace(ST, - PtrTy.getAddressSpace(), - Op == G_LOAD); + unsigned MaxSize = maxSizeForAddrSpace( + ST, PtrTy.getAddressSpace(), Op == G_LOAD, + Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); // FIXME: Handle widened to power of 2 results better. This ends // up scalarizing. @@ -1284,7 +1484,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // We're probably decomposing an odd sized store. Try to split // to the widest type. TODO: Account for alignment. As-is it // should be OK, since the new parts will be further legalized. 
- unsigned FloorSize = PowerOf2Floor(DstSize); + unsigned FloorSize = llvm::bit_floor(DstSize); return std::pair( 0, LLT::scalarOrVector( ElementCount::getFixed(FloorSize / EltSize), EltTy)); @@ -1335,7 +1535,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, - G_ATOMICRMW_UMIN}) + G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP}) .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S64, GlobalPtr}, {S64, LocalPtr}, {S32, RegionPtr}, {S64, RegionPtr}}); @@ -1348,7 +1548,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); if (ST.hasGFX90AInsts()) Atomic.legalFor({{S64, LocalPtr}}); - if (ST.hasGFX940Insts()) + if (ST.hasAtomicDsPkAdd16Insts()) Atomic.legalFor({{V2S16, LocalPtr}}); } if (ST.hasAtomicFaddInsts()) @@ -1450,10 +1650,21 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT VecTy = Query.Types[VecTypeIdx]; const LLT IdxTy = Query.Types[IdxTypeIdx]; const unsigned EltSize = EltTy.getSizeInBits(); + const bool isLegalVecType = + !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits()); + // Address space 8 pointers are 128-bit wide values, but the logic + // below will try to bitcast them to 2N x s64, which will fail. + // Therefore, as an intermediate step, wrap extracts/insertions from a + // ptrtoint-ing the vector and scalar arguments (or inttoptring the + // extraction result) in order to produce a vector operation that can + // be handled by the logic below. 
+ if (EltTy.isPointer() && EltSize > 64) + return true; return (EltSize == 32 || EltSize == 64) && VecTy.getSizeInBits() % 32 == 0 && VecTy.getSizeInBits() <= MaxRegisterSize && - IdxTy.getSizeInBits() == 32; + IdxTy.getSizeInBits() == 32 && + isLegalVecType; }) .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), bitcastToVectorElement32(VecTypeIdx)) @@ -1479,6 +1690,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampScalar(IdxTypeIdx, S32, S32) .clampMaxNumElements(VecTypeIdx, S32, 32) // TODO: Clamp elements for 64-bit vectors? + .moreElementsIf( + isIllegalRegisterType(VecTypeIdx), + moreElementsToNextExistingRegClass(VecTypeIdx)) // It should only be necessary with variable indexes. // As a last resort, lower to the stack .lower(); @@ -1533,7 +1747,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalForCartesianProduct(AllS64Vectors, {S64}) .clampNumElements(0, V16S32, V32S32) .clampNumElements(0, V2S64, V16S64) - .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); + .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)) + .moreElementsIf( + isIllegalRegisterType(0), + moreElementsToNextExistingRegClass(0)); if (ST.hasScalarPackInsts()) { BuildVector @@ -1575,7 +1792,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT &EltTy = Ty.getElementType(); if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) return true; - if (!isPowerOf2_32(EltTy.getSizeInBits())) + if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits())) return true; } return false; @@ -1623,8 +1840,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Builder.widenScalarIf( [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[BigTyIdx]; - return !isPowerOf2_32(Ty.getSizeInBits()) && - Ty.getSizeInBits() % 16 != 0; + return Ty.getSizeInBits() % 16 != 0; }, [=](const LegalityQuery &Query) { // Pick the next power of 2, or a multiple of 64 over 128. 
@@ -1778,10 +1994,16 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, case TargetOpcode::G_SEXTLOAD: case TargetOpcode::G_ZEXTLOAD: return legalizeLoad(Helper, MI); + case TargetOpcode::G_STORE: + return legalizeStore(Helper, MI); case TargetOpcode::G_FMAD: return legalizeFMad(MI, MRI, B); case TargetOpcode::G_FDIV: return legalizeFDIV(MI, MRI, B); + case TargetOpcode::G_FFREXP: + return legalizeFFREXP(MI, MRI, B); + case TargetOpcode::G_FSQRT: + return legalizeFSQRT(MI, MRI, B); case TargetOpcode::G_UDIV: case TargetOpcode::G_UREM: case TargetOpcode::G_UDIVREM: @@ -1792,10 +2014,13 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, return legalizeSignedDIV_REM(MI, MRI, B); case TargetOpcode::G_ATOMIC_CMPXCHG: return legalizeAtomicCmpXChg(MI, MRI, B); + case TargetOpcode::G_FLOG2: + return legalizeFlog2(MI, B); case TargetOpcode::G_FLOG: - return legalizeFlog(MI, B, numbers::ln2f); case TargetOpcode::G_FLOG10: - return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); + return legalizeFlogCommon(MI, B); + case TargetOpcode::G_FEXP2: + return legalizeFExp2(MI, B); case TargetOpcode::G_FEXP: return legalizeFExp(MI, B); case TargetOpcode::G_FPOW: @@ -1856,7 +2081,8 @@ Register AMDGPULegalizerInfo::getSegmentAperture( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); // For code object version 5, private_base and shared_base are passed through // implicit kernargs. - if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= + AMDGPU::AMDHSA_COV5) { AMDGPUTargetLowering::ImplicitParameter Param = AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE : AMDGPUTargetLowering::PRIVATE_BASE; @@ -2192,9 +2418,7 @@ bool AMDGPULegalizerInfo::legalizeITOFP( : B.buildUITOFP(S64, Unmerge.getReg(1)); auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); - auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) - .addUse(CvtHi.getReg(0)) - .addUse(ThirtyTwo.getReg(0)); + auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo); // TODO: Should this propagate fast-math-flags? B.buildFAdd(Dst, LdExp, CvtLo); @@ -2225,10 +2449,7 @@ bool AMDGPULegalizerInfo::legalizeITOFP( auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); - B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, - /*HasSideEffects=*/false) - .addUse(FVal.getReg(0)) - .addUse(Scale.getReg(0)); + B.buildFLdexp(Dst, FVal, Scale); MI.eraseFromParent(); return true; } @@ -2273,13 +2494,15 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, } MachineInstrBuilder K0, K1; if (SrcLT == S64) { - K0 = B.buildFConstant(S64, - BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); - K1 = B.buildFConstant(S64, - BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); + K0 = B.buildFConstant( + S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000))); + K1 = B.buildFConstant( + S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); } else { - K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); - K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); + K0 = B.buildFConstant( + S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000))); + K1 = B.buildFConstant( + S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000))); } auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); @@ -2329,6 +2552,30 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt( // TODO: Promote dynamic indexing of s16 to s32 + Register Dst = 
MI.getOperand(0).getReg(); + Register Vec = MI.getOperand(1).getReg(); + + LLT VecTy = MRI.getType(Vec); + LLT EltTy = VecTy.getElementType(); + assert(EltTy == MRI.getType(Dst)); + + // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts + // but we can't go directly to that logic because you can't bitcast a vector + // of pointers to a vector of integers. Therefore, introduce an intermediate + // vector of integers using ptrtoint (and inttoptr on the output) in order to + // drive the legalization forward. + if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) { + LLT IntTy = LLT::scalar(EltTy.getSizeInBits()); + LLT IntVecTy = VecTy.changeElementType(IntTy); + + auto IntVec = B.buildPtrToInt(IntVecTy, Vec); + auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2)); + B.buildIntToPtr(Dst, IntElt); + + MI.eraseFromParent(); + return true; + } + // FIXME: Artifact combiner probably should have replaced the truncated // constant before this, so we shouldn't need // getIConstantVRegValWithLookThrough. @@ -2338,13 +2585,6 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt( return true; const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); - Register Dst = MI.getOperand(0).getReg(); - Register Vec = MI.getOperand(1).getReg(); - - LLT VecTy = MRI.getType(Vec); - LLT EltTy = VecTy.getElementType(); - assert(EltTy == MRI.getType(Dst)); - if (IdxVal < VecTy.getNumElements()) { auto Unmerge = B.buildUnmerge(EltTy, Vec); B.buildCopy(Dst, Unmerge.getReg(IdxVal)); @@ -2363,6 +2603,33 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( // TODO: Promote dynamic indexing of s16 to s32 + Register Dst = MI.getOperand(0).getReg(); + Register Vec = MI.getOperand(1).getReg(); + Register Ins = MI.getOperand(2).getReg(); + + LLT VecTy = MRI.getType(Vec); + LLT EltTy = VecTy.getElementType(); + assert(EltTy == MRI.getType(Ins)); + + // Other legalization maps vector<? 
x [type bigger than 64 bits]> via bitcasts + // but we can't go directly to that logic because you can't bitcast a vector + // of pointers to a vector of integers. Therefore, make the pointer vector + // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd + // new value, and then inttoptr the result vector back. This will then allow + // the rest of legalization to take over. + if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) { + LLT IntTy = LLT::scalar(EltTy.getSizeInBits()); + LLT IntVecTy = VecTy.changeElementType(IntTy); + + auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec); + auto IntIns = B.buildPtrToInt(IntTy, Ins); + auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns, + MI.getOperand(3)); + B.buildIntToPtr(Dst, IntVecDest); + MI.eraseFromParent(); + return true; + } + // FIXME: Artifact combiner probably should have replaced the truncated // constant before this, so we shouldn't need // getIConstantVRegValWithLookThrough. @@ -2372,14 +2639,6 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( return true; const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); - Register Dst = MI.getOperand(0).getReg(); - Register Vec = MI.getOperand(1).getReg(); - Register Ins = MI.getOperand(2).getReg(); - - LLT VecTy = MRI.getType(Vec); - LLT EltTy = VecTy.getElementType(); - assert(EltTy == MRI.getType(Ins)); - (void)Ins; unsigned NumElts = VecTy.getNumElements(); if (IdxVal < NumElts) { @@ -2479,7 +2738,8 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, else MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); - B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); + if (!B.getMRI()->getRegClassOrNull(PCReg)) + B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); if (PtrTy.getSizeInBits() == 32) B.buildExtract(DstReg, PCReg, 0); @@ -2535,7 +2795,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( // allocated ones. They all share the same offset. 
if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { // Adjust alignment for that dynamic shared memory array. - MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); + MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV)); LLT S32 = LLT::scalar(32); auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); @@ -2620,6 +2880,13 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, Register ValReg = MI.getOperand(0).getReg(); LLT ValTy = MRI.getType(ValReg); + if (hasBufferRsrcWorkaround(ValTy)) { + Observer.changingInstr(MI); + castBufferRsrcFromV4I32(MI, B, MRI, 0); + Observer.changedInstr(MI); + return true; + } + MachineMemOperand *MMO = *MI.memoperands_begin(); const unsigned ValSize = ValTy.getSizeInBits(); const LLT MemTy = MMO->getMemoryType(); @@ -2677,6 +2944,24 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, return false; } +bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper, + MachineInstr &MI) const { + MachineIRBuilder &B = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *B.getMRI(); + GISelChangeObserver &Observer = Helper.Observer; + + Register DataReg = MI.getOperand(0).getReg(); + LLT DataTy = MRI.getType(DataReg); + + if (hasBufferRsrcWorkaround(DataTy)) { + Observer.changingInstr(MI); + castBufferRsrcArgToV4I32(MI, B, 0); + Observer.changedInstr(MI); + return true; + } + return false; +} + bool AMDGPULegalizerInfo::legalizeFMad( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -2688,9 +2973,11 @@ bool AMDGPULegalizerInfo::legalizeFMad( // TODO: Always legal with future ftz flag. // FIXME: Do we need just output? 
- if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) + if (Ty == LLT::scalar(32) && + MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()) return true; - if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) + if (Ty == LLT::scalar(16) && + MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()) return true; MachineIRBuilder HelperBuilder(MI); @@ -2724,31 +3011,449 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( return true; } -bool AMDGPULegalizerInfo::legalizeFlog( - MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { +/// Return true if it's known that \p Src can never be an f32 denormal value. +static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI, + Register Src) { + Register ExtSrc; + if (mi_match(Src, MRI, m_GFPExt(m_Reg(ExtSrc)))) + return MRI.getType(ExtSrc) == LLT::scalar(16); + return false; +} + +static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) { + if (Flags & MachineInstr::FmAfn) + return true; + const auto &Options = MF.getTarget().Options; + return Options.UnsafeFPMath || Options.ApproxFuncFPMath; +} + +static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, + unsigned Flags) { + return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) && + MF.getDenormalMode(APFloat::IEEEsingle()).Input != + DenormalMode::PreserveSign; +} + +std::pair<Register, Register> +AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src, + unsigned Flags) const { + if (allowApproxFunc(B.getMF(), Flags) || + !needsDenormHandlingF32(B.getMF(), Src, Flags)) + return {}; + + const LLT F32 = LLT::scalar(32); + auto SmallestNormal = B.buildFConstant( + F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle())); + auto IsLtSmallestNormal = + B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal); + + auto Scale32 = B.buildFConstant(F32, 0x1.0p+32); + auto One = B.buildFConstant(F32, 1.0); + auto ScaleFactor = + 
B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags); + auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags); + + return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)}; +} + +bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI, + MachineIRBuilder &B) const { + // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals. + // If we have to handle denormals, scale up the input and adjust the result. + + // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0) + // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0) + Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); LLT Ty = B.getMRI()->getType(Dst); unsigned Flags = MI.getFlags(); - auto Log2Operand = B.buildFLog2(Ty, Src, Flags); - auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); + if (Ty == LLT::scalar(16)) { + const LLT F32 = LLT::scalar(32); + // Nothing in half is a denormal when promoted to f32. + auto Ext = B.buildFPExt(F32, Src, Flags); + auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}, false) + .addUse(Ext.getReg(0)) + .setMIFlags(Flags); + B.buildFPTrunc(Dst, Log2, Flags); + MI.eraseFromParent(); + return true; + } + + assert(Ty == LLT::scalar(32)); + + auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags); + if (!ScaledInput) { + B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}, false) + .addUse(Src) + .setMIFlags(Flags); + MI.eraseFromParent(); + return true; + } + + auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) + .addUse(ScaledInput) + .setMIFlags(Flags); + + auto ThirtyTwo = B.buildFConstant(Ty, 32.0); + auto Zero = B.buildFConstant(Ty, 0.0); + auto ResultOffset = + B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags); + B.buildFSub(Dst, Log2, ResultOffset, Flags); - B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); MI.eraseFromParent(); return true; } -bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, - MachineIRBuilder &B) const { 
+static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y, + Register Z, unsigned Flags) { + auto FMul = B.buildFMul(Ty, X, Y, Flags); + return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0); +} + +bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, + MachineIRBuilder &B) const { + const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10; + assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG); + + MachineRegisterInfo &MRI = *B.getMRI(); + Register Dst = MI.getOperand(0).getReg(); + Register X = MI.getOperand(1).getReg(); + unsigned Flags = MI.getFlags(); + const LLT Ty = MRI.getType(X); + MachineFunction &MF = B.getMF(); + + const LLT F32 = LLT::scalar(32); + const LLT F16 = LLT::scalar(16); + + const AMDGPUTargetMachine &TM = + static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); + + if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) || + TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) { + const double Log2BaseInv = + IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2; + + if (Ty == F16 && !ST.has16BitInsts()) { + Register LogVal = MRI.createGenericVirtualRegister(F32); + auto PromoteSrc = B.buildFPExt(F32, X); + legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), Log2BaseInv, Flags); + B.buildFPTrunc(Dst, LogVal); + } else { + legalizeFlogUnsafe(B, Dst, X, Log2BaseInv, Flags); + } + + MI.eraseFromParent(); + return true; + } + + auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags); + if (ScaledInput) + X = ScaledInput; + + auto Y = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) + .addUse(X) + .setMIFlags(Flags); + + Register R; + if (ST.hasFastFMAF32()) { + // c+cc are ln(2)/ln(10) to more than 49 bits + const float c_log10 = 0x1.344134p-2f; + const float cc_log10 = 0x1.09f79ep-26f; + + // c + cc is ln(2) to more than 49 bits + const float c_log = 0x1.62e42ep-1f; + const float cc_log = 0x1.efa39ep-25f; + + auto C = B.buildFConstant(Ty, IsLog10 ? 
c_log10 : c_log); + auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log); + + R = B.buildFMul(Ty, Y, C, Flags).getReg(0); + auto NegR = B.buildFNeg(Ty, R, Flags); + auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags); + auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags); + R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0); + } else { + // ch+ct is ln(2)/ln(10) to more than 36 bits + const float ch_log10 = 0x1.344000p-2f; + const float ct_log10 = 0x1.3509f6p-18f; + + // ch + ct is ln(2) to more than 36 bits + const float ch_log = 0x1.62e000p-1f; + const float ct_log = 0x1.0bfbe8p-15f; + + auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log); + auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log); + + auto MaskConst = B.buildConstant(Ty, 0xfffff000); + auto YH = B.buildAnd(Ty, Y, MaskConst); + auto YT = B.buildFSub(Ty, Y, YH, Flags); + auto YTCT = B.buildFMul(Ty, YT, CT, Flags); + + Register Mad0 = + getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags); + Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags); + R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags); + } + + const bool IsFiniteOnly = + (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) && + (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath); + + if (!IsFiniteOnly) { + // Expand isfinite(x) => fabs(x) < inf + auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); + auto Fabs = B.buildFAbs(Ty, Y); + auto IsFinite = + B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags); + R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0); + } + + if (ScaledInput) { + auto Zero = B.buildFConstant(Ty, 0.0); + auto ShiftK = + B.buildFConstant(Ty, IsLog10 ? 
0x1.344136p+3f : 0x1.62e430p+4f); + auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags); + B.buildFSub(Dst, R, Shift, Flags); + } else { + B.buildCopy(Dst, R); + } + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, + Register Src, + double Log2BaseInverted, + unsigned Flags) const { + LLT Ty = B.getMRI()->getType(Dst); + auto Log2Operand = Ty == LLT::scalar(16) + ? B.buildFLog2(Ty, Src, Flags) + : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) + .addUse(Src) + .setMIFlags(Flags); + auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); + B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, + MachineIRBuilder &B) const { + // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals. + // If we have to handle denormals, scale up the input and adjust the result. + Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); unsigned Flags = MI.getFlags(); LLT Ty = B.getMRI()->getType(Dst); + const LLT F16 = LLT::scalar(16); + const LLT F32 = LLT::scalar(32); + + if (Ty == F16) { + // Nothing in half is a denormal when promoted to f32. + auto Ext = B.buildFPExt(F32, Src, Flags); + auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}, false) + .addUse(Ext.getReg(0)) + .setMIFlags(Flags); + B.buildFPTrunc(Dst, Log2, Flags); + MI.eraseFromParent(); + return true; + } + + assert(Ty == F32); + + if (allowApproxFunc(B.getMF(), Flags) || + !needsDenormHandlingF32(B.getMF(), Src, Flags)) { + B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false) + .addUse(Src) + .setMIFlags(Flags); + MI.eraseFromParent(); + return true; + } + + // bool needs_scaling = x < -0x1.f80000p+6f; + // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 
0x1.0p-64f : 1.0f); + + // -nextafter(128.0, -1) + auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f); + auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, + RangeCheckConst, Flags); + + auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f); + auto Zero = B.buildFConstant(Ty, 0.0); + auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags); + auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags); + auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false) + .addUse(AddInput.getReg(0)) + .setMIFlags(Flags); + + auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f); + auto One = B.buildFConstant(Ty, 1.0); + auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags); + B.buildFMul(Dst, Exp2, ResultScale, Flags); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, + Register Src, + unsigned Flags) const { + LLT Ty = B.getMRI()->getType(Dst); auto K = B.buildFConstant(Ty, numbers::log2e); auto Mul = B.buildFMul(Ty, Src, K, Flags); - B.buildFExp2(Dst, Mul, Flags); + + if (Ty == LLT::scalar(32)) { + B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false) + .addUse(Mul.getReg(0)) + .setMIFlags(Flags); + } else { + B.buildFExp2(Dst, Mul.getReg(0), Flags); + } + + return true; +} + +bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, + MachineIRBuilder &B) const { + Register Dst = MI.getOperand(0).getReg(); + Register X = MI.getOperand(1).getReg(); + const unsigned Flags = MI.getFlags(); + MachineFunction &MF = B.getMF(); + MachineRegisterInfo &MRI = *B.getMRI(); + LLT Ty = MRI.getType(Dst); + const LLT F16 = LLT::scalar(16); + const LLT F32 = LLT::scalar(32); + const bool IsExp10 = false; // TODO: For some reason exp10 is missing + + if (Ty == F16) { + // v_exp_f16 (fmul x, log2e) + if (allowApproxFunc(MF, Flags)) { + // TODO: Does this really require fast? 
+ legalizeFExpUnsafe(B, Dst, X, Flags); + MI.eraseFromParent(); + return true; + } + + // exp(f16 x) -> + // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) + + // Nothing in half is a denormal when promoted to f32. + auto Ext = B.buildFPExt(F32, X, Flags); + Register Lowered = MRI.createGenericVirtualRegister(F32); + legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags); + B.buildFPTrunc(Dst, Lowered, Flags); + MI.eraseFromParent(); + return true; + } + + assert(Ty == F32); + + // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying + // library behavior. Also, is known-not-daz source sufficient? + if (allowApproxFunc(MF, Flags) && !needsDenormHandlingF32(MF, X, Flags)) { + legalizeFExpUnsafe(B, Dst, X, Flags); + MI.eraseFromParent(); + return true; + } + + // Algorithm: + // + // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) + // + // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer + // n = 64*m + j, 0 <= j < 64 + // + // e^x = 2^((64*m + j + f)/64) + // = (2^m) * (2^(j/64)) * 2^(f/64) + // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) + // + // f = x*(64/ln(2)) - n + // r = f*(ln(2)/64) = x - n*(ln(2)/64) + // + // e^x = (2^m) * (2^(j/64)) * e^r + // + // (2^(j/64)) is precomputed + // + // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + // e^r = 1 + q + // + // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + // + // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) + const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract; + Register PH, PL; + + if (ST.hasFastFMAF32()) { + const float c_exp = numbers::log2ef; + const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits + const float c_exp10 = 0x1.a934f0p+1f; + const float cc_exp10 = 0x1.2f346ep-24f; + + auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp); + PH = B.buildFMul(Ty, X, C, Flags).getReg(0); + auto NegPH = B.buildFNeg(Ty, PH, Flags); + auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags); + + auto CC = B.buildFConstant(Ty, IsExp10 ? 
cc_exp10 : cc_exp); + PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0); + } else { + const float ch_exp = 0x1.714000p+0f; + const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits + + const float ch_exp10 = 0x1.a92000p+1f; + const float cl_exp10 = 0x1.4f0978p-11f; + + auto MaskConst = B.buildConstant(Ty, 0xfffff000); + auto XH = B.buildAnd(Ty, X, MaskConst); + auto XL = B.buildFSub(Ty, X, XH, Flags); + + auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp); + PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0); + + auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp); + auto XLCL = B.buildFMul(Ty, XL, CL, Flags); + + Register Mad0 = + getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags); + PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags); + } + + auto E = B.buildFRint(Ty, PH, Flags); + + // It is unsafe to contract this fsub into the PH multiply. + auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract); + auto A = B.buildFAdd(Ty, PHSubE, PL, Flags); + auto IntE = B.buildFPTOSI(LLT::scalar(32), E); + + auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false) + .addUse(A.getReg(0)) + .setMIFlags(Flags); + auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags); + + auto UnderflowCheckConst = + B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f); + auto Zero = B.buildFConstant(Ty, 0.0); + auto Underflow = + B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst); + + R = B.buildSelect(Ty, Underflow, Zero, R); + + const auto &Options = MF.getTarget().Options; + + if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) { + auto OverflowCheckConst = + B.buildFConstant(Ty, IsExp10 ? 
0x1.344136p+5f : 0x1.62e430p+6f); + + auto Overflow = + B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst); + auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); + R = B.buildSelect(Ty, Overflow, Inf, R, Flags); + } + + B.buildCopy(Dst, R); MI.eraseFromParent(); return true; } @@ -2831,7 +3536,8 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, // shouldn't matter? Register ModSrc = stripAnySourceMods(OrigSrc, MRI); - auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); + auto Const = + B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff)); Register Min = MRI.createGenericVirtualRegister(S64); @@ -2890,15 +3596,18 @@ bool AMDGPULegalizerInfo::legalizeBuildVector( // the outer loop going over parts of the result, the outer loop should go // over parts of one of the factors. This should result in instruction // selection that makes full use of S_ADDC_U32 instructions. -void AMDGPULegalizerInfo::buildMultiply( - LegalizerHelper &Helper, MutableArrayRef<Register> Accum, - ArrayRef<Register> Src0, ArrayRef<Register> Src1, - bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const { +void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper, + MutableArrayRef<Register> Accum, + ArrayRef<Register> Src0, + ArrayRef<Register> Src1, + bool UsePartialMad64_32, + bool SeparateOddAlignedProducts) const { // Use (possibly empty) vectors of S1 registers to represent the set of // carries from one pair of positions to the next. 
using Carry = SmallVector<Register, 2>; MachineIRBuilder &B = Helper.MIRBuilder; + GISelKnownBits &KB = *Helper.getKnownBits(); const LLT S1 = LLT::scalar(1); const LLT S32 = LLT::scalar(32); @@ -2918,6 +3627,12 @@ void AMDGPULegalizerInfo::buildMultiply( return Zero64; }; + SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros; + for (unsigned i = 0; i < Src0.size(); ++i) { + Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero()); + Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero()); + } + // Merge the given carries into the 32-bit LocalAccum, which is modified // in-place. // @@ -2980,9 +3695,14 @@ void AMDGPULegalizerInfo::buildMultiply( if (LocalAccum.size() == 1 && (!UsePartialMad64_32 || !CarryIn.empty())) { do { + // Skip multiplication if one of the operands is 0 unsigned j1 = DstIndex - j0; + if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { + ++j0; + continue; + } auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); - if (!LocalAccum[0]) { + if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) { LocalAccum[0] = Mul.getReg(0); } else { if (CarryIn.empty()) { @@ -3022,12 +3742,17 @@ void AMDGPULegalizerInfo::buildMultiply( do { unsigned j1 = DstIndex - j0; + if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { + ++j0; + continue; + } auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, {Src0[j0], Src1[j1], Tmp}); Tmp = Mad.getReg(0); if (!HaveSmallAccum) CarryOut.push_back(Mad.getReg(1)); HaveSmallAccum = false; + ++j0; } while (j0 <= DstIndex); @@ -3170,7 +3895,6 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, B.buildMergeLikeInstr(DstReg, AccumRegs); MI.eraseFromParent(); return true; - } // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to @@ -3259,7 +3983,7 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, // TODO: Should we try to emit this once in the entry block? 
const LLT S32 = LLT::scalar(32); const unsigned Mask = Arg->getMask(); - const unsigned Shift = countTrailingZeros<unsigned>(Mask); + const unsigned Shift = llvm::countr_zero<unsigned>(Mask); Register AndMaskSrc = LiveIn; @@ -3432,7 +4156,7 @@ void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, // Initial estimate of inv(y). auto FloatY = B.buildUITOFP(S32, Y); auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); - auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); + auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe)); auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); auto Z = B.buildFPTOUI(S32, ScaledY); @@ -3482,21 +4206,23 @@ static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); - auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 - B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); + auto Mad = B.buildFMAD( + S32, CvtHi, // 2**32 + B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo); auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); - auto Mul1 = - B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); + auto Mul1 = B.buildFMul( + S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc))); // 2**(-32) - auto Mul2 = - B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); + auto Mul2 = B.buildFMul( + S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000))); auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); // -(2**32) - auto Mad2 = B.buildFMAD(S32, Trunc, - B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); + auto Mad2 = B.buildFMAD( + S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)), + Mul1); auto ResultLo = B.buildFPTOUI(S32, Mad2); auto ResultHi = B.buildFPTOUI(S32, Trunc); @@ -3734,13 +4460,20 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr 
&MI, LLT ResTy = MRI.getType(Res); const MachineFunction &MF = B.getMF(); - bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || - MI.getFlag(MachineInstr::FmAfn); - - if (!AllowInaccurateRcp) - return false; + bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) || + MF.getTarget().Options.UnsafeFPMath; if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { + if (!AllowInaccurateRcp && ResTy != LLT::scalar(16)) + return false; + + // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to + // the CI documentation has a worst case error of 1 ulp. + // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to + // use it as long as we aren't trying to use denormals. + // + // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp. + // 1 / x -> RCP(x) if (CLHS->isExactlyValue(1.0)) { B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) @@ -3751,6 +4484,8 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, return true; } + // TODO: Match rsq + // -1 / x -> RCP( FNEG(x) ) if (CLHS->isExactlyValue(-1.0)) { auto FNeg = B.buildFNeg(ResTy, RHS, Flags); @@ -3763,6 +4498,12 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, } } + // For f16 require arcp only. + // For f32 require afn+arcp. + if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) || + !MI.getFlag(MachineInstr::FmArcp))) + return false; + // x / y -> x * (1.0 / y) auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) .addUse(RHS) @@ -3847,10 +4588,9 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions // to enable denorm mode. When 'Enable' is false, disable denorm mode. -static void toggleSPDenormMode(bool Enable, - MachineIRBuilder &B, +static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, - AMDGPU::SIModeRegisterDefaults Mode) { + SIModeRegisterDefaults Mode) { // Set SP denorm mode to this value. 
unsigned SPDenormMode = Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); @@ -3885,7 +4625,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); - AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); + SIModeRegisterDefaults Mode = MFI->getMode(); uint16_t Flags = MI.getFlags(); @@ -3914,7 +4654,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, // FIXME: Doesn't correctly model the FP mode switch, and the FP operations // aren't modeled as reading it. - if (!Mode.allFP32Denormals()) + if (Mode.FP32Denormals != DenormalMode::getIEEE()) toggleSPDenormMode(true, B, ST, Mode); auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); @@ -3924,7 +4664,9 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); - if (!Mode.allFP32Denormals()) + // FIXME: This mishandles dynamic denormal mode. We need to query the + // current mode and restore the original. + if (Mode.FP32Denormals != DenormalMode::getIEEE()) toggleSPDenormMode(false, B, ST, Mode); auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) @@ -4025,6 +4767,41 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, return true; } +bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + Register Res0 = MI.getOperand(0).getReg(); + Register Res1 = MI.getOperand(1).getReg(); + Register Val = MI.getOperand(2).getReg(); + uint16_t Flags = MI.getFlags(); + + LLT Ty = MRI.getType(Res0); + LLT InstrExpTy = Ty == LLT::scalar(16) ? 
LLT::scalar(16) : LLT::scalar(32); + + auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty}, false) + .addUse(Val) + .setMIFlags(Flags); + auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy}, false) + .addUse(Val) + .setMIFlags(Flags); + + if (ST.hasFractBug()) { + auto Fabs = B.buildFAbs(Ty, Val); + auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty))); + auto IsFinite = + B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags); + auto Zero = B.buildConstant(InstrExpTy, 0); + Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero); + Mant = B.buildSelect(Ty, IsFinite, Mant, Val); + } + + B.buildCopy(Res0, Mant); + B.buildSExtOrTrunc(Res1, Exp); + + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -4039,9 +4816,9 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, auto Abs = B.buildFAbs(S32, RHS, Flags); const APFloat C0Val(1.0f); - auto C0 = B.buildConstant(S32, 0x6f800000); - auto C1 = B.buildConstant(S32, 0x2f800000); - auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); + auto C0 = B.buildFConstant(S32, 0x1p+96f); + auto C1 = B.buildFConstant(S32, 0x1p-32f); + auto C2 = B.buildFConstant(S32, 1.0f); auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); @@ -4060,6 +4837,90 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, return true; } +bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + // For double type, the SQRT and RSQ instructions don't have required + // precision, we apply Goldschmidt's algorithm to improve the result: + // + // y0 = rsq(x) + // g0 = x * y0 + // h0 = 0.5 * y0 + // + // r0 = 0.5 - h0 * g0 + // g1 = g0 * r0 + g0 + // h1 = h0 * r0 + h0 + // + // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 + // g2 = g1 * r1 + 
g1 g2 = d0 * h1 + g1 + // h2 = h1 * r1 + h1 + // + // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 + // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 + // + // sqrt(x) = g3 + + const LLT S1 = LLT::scalar(1); + const LLT S32 = LLT::scalar(32); + const LLT F64 = LLT::scalar(64); + + Register Dst = MI.getOperand(0).getReg(); + assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt"); + + Register X = MI.getOperand(1).getReg(); + unsigned Flags = MI.getFlags(); + + auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767); + + auto ZeroInt = B.buildConstant(S32, 0); + auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant); + + // Scale up input if it is too small. + auto ScaleUpFactor = B.buildConstant(S32, 256); + auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt); + auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags); + + auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}, false) + .addReg(SqrtX.getReg(0)); + + auto Half = B.buildFConstant(F64, 0.5); + auto SqrtH0 = B.buildFMul(F64, SqrtY, Half); + auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY); + + auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0); + auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half); + + auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0); + auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0); + + auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1); + auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX); + + auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1); + + auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2); + auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX); + + auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2); + + // Scale down the result. + auto ScaleDownFactor = B.buildConstant(S32, -128); + auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt); + SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags); + + // TODO: Switch to fcmp oeq 0 for finite only. 
Can't fully remove this check + // with finite only or nsz because rsq(+/-0) = +/-inf + + // TODO: Check for DAZ and expand to subnormals + auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); + + // If x is +INF, +0, or -0, use its original value + B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags); + + MI.eraseFromParent(); + return true; +} + // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. // FIXME: Why do we handle this one but not other removed instructions? // @@ -4159,6 +5020,50 @@ bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, return true; } +/// To create a buffer resource from a 64-bit pointer, mask off the upper 32 +/// bits of the pointer and replace them with the stride argument, then +/// merge_values everything together. In the common case of a raw buffer (the +/// stride component is 0), we can just AND off the upper half. +bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { + Register Result = MI.getOperand(0).getReg(); + Register Pointer = MI.getOperand(2).getReg(); + Register Stride = MI.getOperand(3).getReg(); + Register NumRecords = MI.getOperand(4).getReg(); + Register Flags = MI.getOperand(5).getReg(); + + LLT S32 = LLT::scalar(32); + + B.setInsertPt(B.getMBB(), ++B.getInsertPt()); + auto Unmerge = B.buildUnmerge(S32, Pointer); + Register LowHalf = Unmerge.getReg(0); + Register HighHalf = Unmerge.getReg(1); + + auto AndMask = B.buildConstant(S32, 0x0000ffff); + auto Masked = B.buildAnd(S32, HighHalf, AndMask); + + MachineInstrBuilder NewHighHalf = Masked; + std::optional<ValueAndVReg> StrideConst = + getIConstantVRegValWithLookThrough(Stride, MRI); + if (!StrideConst || !StrideConst->Value.isZero()) { + MachineInstrBuilder ShiftedStride; + if (StrideConst) { + uint32_t StrideVal = StrideConst->Value.getZExtValue(); + uint32_t ShiftedStrideVal = StrideVal << 16; + ShiftedStride = 
B.buildConstant(S32, ShiftedStrideVal); + } else { + auto ExtStride = B.buildAnyExt(S32, Stride); + auto ShiftConst = B.buildConstant(S32, 16); + ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst); + } + NewHighHalf = B.buildOr(S32, Masked, ShiftedStride); + } + Register NewHighHalfReg = NewHighHalf.getReg(0); + B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags}); + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -4227,7 +5132,7 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, std::pair<Register, unsigned> AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const { - const unsigned MaxImm = 4095; + const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(); Register BaseReg; unsigned ImmOffset; const LLT S32 = LLT::scalar(32); @@ -4240,13 +5145,14 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, if (MRI.getType(BaseReg).isPointer()) BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); - // If the immediate value is too big for the immoffset field, put the value - // and -4096 into the immoffset field so that the value that is copied/added - // for the voffset field is a multiple of 4096, and it stands more chance - // of being CSEd with the copy/add for another similar load/store. - // However, do not do that rounding down to a multiple of 4096 if that is a - // negative number, as it appears to be illegal to have a negative offset - // in the vgpr, even if adding the immediate offset makes it positive. + // If the immediate value is too big for the immoffset field, put only bits + // that would normally fit in the immoffset field. The remaining value that + // is copied/added for the voffset field is a large power of 2, and it + // stands more chance of being CSEd with the copy/add for another similar + // load/store. 
+ // However, do not do that rounding down if that is a negative + // number, as it appears to be illegal to have a negative offset in the + // vgpr, even if adding the immediate offset makes it positive. unsigned Overflow = ImmOffset & ~MaxImm; ImmOffset -= Overflow; if ((int32_t)Overflow < 0) { @@ -4269,31 +5175,6 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, return std::pair(BaseReg, ImmOffset); } -/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. -void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, - Register VOffset, Register SOffset, - unsigned ImmOffset, Register VIndex, - MachineRegisterInfo &MRI) const { - std::optional<ValueAndVReg> MaybeVOffsetVal = - getIConstantVRegValWithLookThrough(VOffset, MRI); - std::optional<ValueAndVReg> MaybeSOffsetVal = - getIConstantVRegValWithLookThrough(SOffset, MRI); - std::optional<ValueAndVReg> MaybeVIndexVal = - getIConstantVRegValWithLookThrough(VIndex, MRI); - // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, - // update the MMO with that offset. The stride is unknown so we can only do - // this if VIndex is constant 0. - if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && - MaybeVIndexVal->Value == 0) { - uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + - MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; - MMO->setOffset(TotalOffset); - } else { - // We don't have a constant combined offset to use in the MMO. Give up. - MMO->setValue((Value *)nullptr); - } -} - /// Handle register layout difference for f16 images for some subtargets. Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, @@ -4365,6 +5246,10 @@ Register AMDGPULegalizerInfo::fixStoreSourceType( const LLT S16 = LLT::scalar(16); + // Fixup buffer resources themselves needing to be v4i128. 
+ if (hasBufferRsrcWorkaround(Ty)) + return castBufferRsrcToV4I32(VData, B); + // Fixup illegal register types for i8 stores. if (Ty == LLT::scalar(8) || Ty == S16) { Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); @@ -4393,6 +5278,7 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, const LLT S32 = LLT::scalar(32); VData = fixStoreSourceType(B, VData, IsFormat); + castBufferRsrcArgToV4I32(MI, B, 2); Register RSrc = MI.getOperand(2).getReg(); MachineMemOperand *MMO = *MI.memoperands_begin(); @@ -4426,7 +5312,6 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); - updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); unsigned Opc; if (IsTyped) { @@ -4510,6 +5395,7 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, ++OpOffset; } + castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset); Register RSrc = MI.getOperand(2 + OpOffset).getReg(); // The typed intrinsics add an immediate after the registers. @@ -4538,12 +5424,17 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, unsigned ImmOffset; LLT Ty = MRI.getType(Dst); + // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the + // logic doesn't have to handle that case. 
+ if (hasBufferRsrcWorkaround(Ty)) { + Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0); + Dst = MI.getOperand(0).getReg(); + } LLT EltTy = Ty.getScalarType(); const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); const bool Unpacked = ST.hasUnpackedD16VMem(); std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); - updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); unsigned Opc; @@ -4624,69 +5515,87 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, return true; } -bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, - MachineIRBuilder &B, - bool IsInc) const { - unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : - AMDGPU::G_AMDGPU_ATOMIC_DEC; - B.buildInstr(Opc) - .addDef(MI.getOperand(0).getReg()) - .addUse(MI.getOperand(2).getReg()) - .addUse(MI.getOperand(3).getReg()) - .cloneMemRefs(MI); - MI.eraseFromParent(); - return true; -} - static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { switch (IntrID) { case Intrinsic::amdgcn_raw_buffer_atomic_swap: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: case Intrinsic::amdgcn_struct_buffer_atomic_swap: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; case Intrinsic::amdgcn_raw_buffer_atomic_add: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: case Intrinsic::amdgcn_struct_buffer_atomic_add: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; case Intrinsic::amdgcn_raw_buffer_atomic_sub: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: case Intrinsic::amdgcn_struct_buffer_atomic_sub: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; case Intrinsic::amdgcn_raw_buffer_atomic_smin: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: case Intrinsic::amdgcn_struct_buffer_atomic_smin: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; case 
Intrinsic::amdgcn_raw_buffer_atomic_umin: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: case Intrinsic::amdgcn_struct_buffer_atomic_umin: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; case Intrinsic::amdgcn_raw_buffer_atomic_smax: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: case Intrinsic::amdgcn_struct_buffer_atomic_smax: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; case Intrinsic::amdgcn_raw_buffer_atomic_umax: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: case Intrinsic::amdgcn_struct_buffer_atomic_umax: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; case Intrinsic::amdgcn_raw_buffer_atomic_and: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: case Intrinsic::amdgcn_struct_buffer_atomic_and: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; case Intrinsic::amdgcn_raw_buffer_atomic_or: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: case Intrinsic::amdgcn_struct_buffer_atomic_or: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; case Intrinsic::amdgcn_raw_buffer_atomic_xor: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: case Intrinsic::amdgcn_struct_buffer_atomic_xor: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; case Intrinsic::amdgcn_raw_buffer_atomic_inc: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: case Intrinsic::amdgcn_struct_buffer_atomic_inc: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; case Intrinsic::amdgcn_raw_buffer_atomic_dec: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: case Intrinsic::amdgcn_struct_buffer_atomic_dec: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; case 
Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; case Intrinsic::amdgcn_raw_buffer_atomic_fadd: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; case Intrinsic::amdgcn_raw_buffer_atomic_fmin: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: case Intrinsic::amdgcn_struct_buffer_atomic_fmin: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; case Intrinsic::amdgcn_raw_buffer_atomic_fmax: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_buffer_atomic_fmax: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; default: llvm_unreachable("unhandled atomic opcode"); @@ -4696,8 +5605,11 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const { - const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || - IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; + const bool IsCmpSwap = + IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || + IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap || + IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap || + IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap; const bool HasReturn = MI.getNumExplicitDefs() != 0; Register Dst; @@ -4710,6 +5622,8 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, OpOffset = -1; } + // Since we don't have 128-bit atomics, we don't need to handle the case of + // p8 argmunents to the atomic itself Register VData = MI.getOperand(2 + 
OpOffset).getReg(); Register CmpVal; @@ -4718,6 +5632,7 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, ++OpOffset; } + castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset); Register RSrc = MI.getOperand(3 + OpOffset).getReg(); const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; @@ -4739,7 +5654,6 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, unsigned ImmOffset; std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); - updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); @@ -4896,7 +5810,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); LLT AddrTy = MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); - const bool IsG16 = GradTy == S16; + const bool IsG16 = + ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16; const bool IsA16 = AddrTy == S16; const bool IsD16 = Ty.getScalarType() == S16; @@ -4967,6 +5882,9 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( return false; } + const unsigned NSAMaxSize = ST.getNSAMaxSize(); + const unsigned HasPartialNSA = ST.hasPartialNSAEncoding(); + if (IsA16 || IsG16) { if (Intr->NumVAddrs > 1) { SmallVector<Register, 4> PackedRegs; @@ -4977,9 +5895,19 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( // See also below in the non-a16 branch const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= ST.getNSAThreshold(MF) && - PackedRegs.size() <= ST.getNSAMaxSize(); + (PackedRegs.size() <= NSAMaxSize || HasPartialNSA); + const bool UsePartialNSA = + UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize; - if (!UseNSA && PackedRegs.size() > 1) { + if (UsePartialNSA) { + // Pack registers that would go over NSAMaxSize into last VAddr register + LLT PackedAddrTy = + LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16); + auto Concat = B.buildConcatVectors( + PackedAddrTy, 
ArrayRef(PackedRegs).slice(NSAMaxSize - 1)); + PackedRegs[NSAMaxSize - 1] = Concat.getReg(0); + PackedRegs.resize(NSAMaxSize); + } else if (!UseNSA && PackedRegs.size() > 1) { LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); PackedRegs[0] = Concat.getReg(0); @@ -5015,16 +5943,22 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( // SIShrinkInstructions will convert NSA encodings to non-NSA after register // allocation when possible. // - // TODO: we can actually allow partial NSA where the final register is a - // contiguous set of the remaining addresses. - // This could help where there are more addresses than supported. + // Partial NSA is allowed on GFX11 where the final register is a contiguous + // set of the remaining addresses. const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= ST.getNSAThreshold(MF) && - CorrectedNumVAddrs <= ST.getNSAMaxSize(); + (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA); + const bool UsePartialNSA = + UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize; - if (!UseNSA && Intr->NumVAddrs > 1) + if (UsePartialNSA) { + convertImageAddrToPacked(B, MI, + ArgOffset + Intr->VAddrStart + NSAMaxSize - 1, + Intr->NumVAddrs - NSAMaxSize + 1); + } else if (!UseNSA && Intr->NumVAddrs > 1) { convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, Intr->NumVAddrs); + } } int Flags = 0; @@ -5237,6 +6171,12 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( Observer.changingInstr(MI); + // Handle needing to s.buffer.load() a p8 value. 
+ if (hasBufferRsrcWorkaround(Ty)) { + Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0); + Dst = MI.getOperand(0).getReg(); + B.setInsertPt(B.getMBB(), MI); + } if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { Ty = getBitcastRegisterType(Ty); Helper.bitcastDst(MI, Ty, 0); @@ -5283,25 +6223,40 @@ bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) return legalizeTrapEndpgm(MI, MRI, B); - if (std::optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { - switch (*HsaAbiVer) { - case ELF::ELFABIVERSION_AMDGPU_HSA_V2: - case ELF::ELFABIVERSION_AMDGPU_HSA_V3: - return legalizeTrapHsaQueuePtr(MI, MRI, B); - case ELF::ELFABIVERSION_AMDGPU_HSA_V4: - case ELF::ELFABIVERSION_AMDGPU_HSA_V5: - return ST.supportsGetDoorbellID() ? - legalizeTrapHsa(MI, MRI, B) : - legalizeTrapHsaQueuePtr(MI, MRI, B); - } - } + const Module *M = B.getMF().getFunction().getParent(); + unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M); + if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3) + return legalizeTrapHsaQueuePtr(MI, MRI, B); - llvm_unreachable("Unknown trap handler"); + return ST.supportsGetDoorbellID() ? + legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B); } bool AMDGPULegalizerInfo::legalizeTrapEndpgm( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock &BB = B.getMBB(); + MachineFunction *MF = BB.getParent(); + + if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) { + BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) + .addImm(0); + MI.eraseFromParent(); + return true; + } + + // We need a block split to make the real endpgm a terminator. We also don't + // want to break phis in successor blocks, so we can't just delete to the + // end of the block. 
+ BB.splitAt(MI, false /*UpdateLiveIns*/); + MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); + MF->push_back(TrapBB); + BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) + .addImm(0); + BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ)) + .addMBB(TrapBB); + + BB.addSuccessor(TrapBB); MI.eraseFromParent(); return true; } @@ -5313,7 +6268,8 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( Register SGPR01(AMDGPU::SGPR0_SGPR1); // For code object version 5, queue_ptr is passed through implicit kernarg. - if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= + AMDGPU::AMDHSA_COV5) { AMDGPUTargetLowering::ImplicitParameter Param = AMDGPUTargetLowering::QUEUE_PTR; uint64_t Offset = @@ -5652,6 +6608,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return false; } + case Intrinsic::amdgcn_make_buffer_rsrc: + return legalizePointerAsRsrcIntrin(MI, MRI, B); case Intrinsic::amdgcn_kernarg_segment_ptr: if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { // This only makes sense to call in a kernel, so just lower to null. 
@@ -5736,60 +6694,100 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_s_buffer_load: return legalizeSBufferLoad(Helper, MI); case Intrinsic::amdgcn_raw_buffer_store: + case Intrinsic::amdgcn_raw_ptr_buffer_store: case Intrinsic::amdgcn_struct_buffer_store: + case Intrinsic::amdgcn_struct_ptr_buffer_store: return legalizeBufferStore(MI, MRI, B, false, false); case Intrinsic::amdgcn_raw_buffer_store_format: + case Intrinsic::amdgcn_raw_ptr_buffer_store_format: case Intrinsic::amdgcn_struct_buffer_store_format: + case Intrinsic::amdgcn_struct_ptr_buffer_store_format: return legalizeBufferStore(MI, MRI, B, false, true); case Intrinsic::amdgcn_raw_tbuffer_store: + case Intrinsic::amdgcn_raw_ptr_tbuffer_store: case Intrinsic::amdgcn_struct_tbuffer_store: + case Intrinsic::amdgcn_struct_ptr_tbuffer_store: return legalizeBufferStore(MI, MRI, B, true, true); case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_ptr_buffer_load: case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_ptr_buffer_load: return legalizeBufferLoad(MI, MRI, B, false, false); case Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_raw_ptr_buffer_load_format: case Intrinsic::amdgcn_struct_buffer_load_format: + case Intrinsic::amdgcn_struct_ptr_buffer_load_format: return legalizeBufferLoad(MI, MRI, B, true, false); case Intrinsic::amdgcn_raw_tbuffer_load: + case Intrinsic::amdgcn_raw_ptr_tbuffer_load: case Intrinsic::amdgcn_struct_tbuffer_load: + case Intrinsic::amdgcn_struct_ptr_tbuffer_load: return legalizeBufferLoad(MI, MRI, B, true, true); case Intrinsic::amdgcn_raw_buffer_atomic_swap: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: case Intrinsic::amdgcn_struct_buffer_atomic_swap: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: case Intrinsic::amdgcn_raw_buffer_atomic_add: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: case Intrinsic::amdgcn_struct_buffer_atomic_add: + case 
Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: case Intrinsic::amdgcn_raw_buffer_atomic_sub: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: case Intrinsic::amdgcn_struct_buffer_atomic_sub: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: case Intrinsic::amdgcn_raw_buffer_atomic_smin: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: case Intrinsic::amdgcn_struct_buffer_atomic_smin: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: case Intrinsic::amdgcn_raw_buffer_atomic_umin: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: case Intrinsic::amdgcn_struct_buffer_atomic_umin: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: case Intrinsic::amdgcn_raw_buffer_atomic_smax: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: case Intrinsic::amdgcn_struct_buffer_atomic_smax: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: case Intrinsic::amdgcn_raw_buffer_atomic_umax: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: case Intrinsic::amdgcn_struct_buffer_atomic_umax: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: case Intrinsic::amdgcn_raw_buffer_atomic_and: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: case Intrinsic::amdgcn_struct_buffer_atomic_and: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: case Intrinsic::amdgcn_raw_buffer_atomic_or: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: case Intrinsic::amdgcn_struct_buffer_atomic_or: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: case Intrinsic::amdgcn_raw_buffer_atomic_xor: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: case Intrinsic::amdgcn_struct_buffer_atomic_xor: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: case Intrinsic::amdgcn_raw_buffer_atomic_inc: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: case Intrinsic::amdgcn_struct_buffer_atomic_inc: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: case Intrinsic::amdgcn_raw_buffer_atomic_dec: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: case 
Intrinsic::amdgcn_struct_buffer_atomic_dec: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: case Intrinsic::amdgcn_raw_buffer_atomic_fmin: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: case Intrinsic::amdgcn_struct_buffer_atomic_fmin: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: case Intrinsic::amdgcn_raw_buffer_atomic_fmax: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: case Intrinsic::amdgcn_struct_buffer_atomic_fmax: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: case Intrinsic::amdgcn_raw_buffer_atomic_fadd: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: case Intrinsic::amdgcn_struct_buffer_atomic_fadd: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: return legalizeBufferAtomic(MI, B, IntrID); - case Intrinsic::amdgcn_atomic_inc: - return legalizeAtomicIncDec(MI, B, true); - case Intrinsic::amdgcn_atomic_dec: - return legalizeAtomicIncDec(MI, B, false); case Intrinsic::trap: return legalizeTrapIntrinsic(MI, MRI, B); case Intrinsic::debugtrap: @@ -5802,6 +6800,17 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); case Intrinsic::amdgcn_image_bvh_intersect_ray: return legalizeBVHIntrinsic(MI, B); + case Intrinsic::amdgcn_fmed3: { + GISelChangeObserver &Observer = Helper.Observer; + + // FIXME: This is to workaround the inability of tablegen match combiners to + // match intrinsics in patterns. 
+ Observer.changingInstr(MI); + MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3)); + MI.removeOperand(1); + Observer.changedInstr(MI); + return true; + } default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 37c987108bc4..04773f275c87 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -71,14 +71,24 @@ public: bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const; + bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const; bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; - bool legalizeFlog(MachineInstr &MI, MachineIRBuilder &B, - double Log2BaseInverted) const; + + std::pair<Register, Register> + getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const; + + bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, + double Log2BaseInverted, unsigned Flags) const; + bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const; + bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, + unsigned Flags) const; bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const; bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, @@ -101,6 +111,9 @@ public: bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; + bool legalizePointerAsRsrcIntrin(MachineInstr &MI, 
MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizePreloadedArgIntrin( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const; @@ -135,6 +148,8 @@ public: MachineIRBuilder &B) const; bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, @@ -142,6 +157,9 @@ public: bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; @@ -165,16 +183,9 @@ public: std::pair<Register, unsigned> splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const; - void updateBufferMMO(MachineMemOperand *MMO, Register VOffset, - Register SOffset, unsigned ImmOffset, Register VIndex, - MachineRegisterInfo &MRI) const; Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore = false) const; - bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B, bool IsFormat) const; - bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B, bool IsFormat) const; Register fixStoreSourceType(MachineIRBuilder &B, Register VData, bool IsFormat) const; @@ -198,9 +209,6 @@ public: bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const; - bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B, - bool IsInc) const; - bool legalizeTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; bool 
legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp deleted file mode 100644 index 93d1eed2cf63..000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ /dev/null @@ -1,177 +0,0 @@ -//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IntrinsicsR600.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/Utils/LowerMemIntrinsics.h" - -#define DEBUG_TYPE "amdgpu-lower-intrinsics" - -using namespace llvm; - -namespace { - -static int MaxStaticSize; - -static cl::opt<int, true> MemIntrinsicExpandSizeThresholdOpt( - "amdgpu-mem-intrinsic-expand-size", - cl::desc("Set minimum mem intrinsic size to expand in IR"), - cl::location(MaxStaticSize), - cl::init(1024), - cl::Hidden); - - -class AMDGPULowerIntrinsics : public ModulePass { -private: - bool makeLIDRangeMetadata(Function &F) const; - -public: - static char ID; - - AMDGPULowerIntrinsics() : ModulePass(ID) {} - - bool runOnModule(Module &M) override; - bool expandMemIntrinsicUses(Function &F); - StringRef getPassName() const override { - return "AMDGPU Lower Intrinsics"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<TargetTransformInfoWrapperPass>(); - } -}; - -} - -char 
AMDGPULowerIntrinsics::ID = 0; - -char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID; - -INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false, - false) - -// TODO: Should refine based on estimated number of accesses (e.g. does it -// require splitting based on alignment) -static bool shouldExpandOperationWithSize(Value *Size) { - ConstantInt *CI = dyn_cast<ConstantInt>(Size); - return !CI || (CI->getSExtValue() > MaxStaticSize); -} - -bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) { - Intrinsic::ID ID = F.getIntrinsicID(); - bool Changed = false; - - for (User *U : llvm::make_early_inc_range(F.users())) { - Instruction *Inst = cast<Instruction>(U); - - switch (ID) { - case Intrinsic::memcpy: { - auto *Memcpy = cast<MemCpyInst>(Inst); - if (shouldExpandOperationWithSize(Memcpy->getLength())) { - Function *ParentFunc = Memcpy->getParent()->getParent(); - const TargetTransformInfo &TTI = - getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc); - expandMemCpyAsLoop(Memcpy, TTI); - Changed = true; - Memcpy->eraseFromParent(); - } - - break; - } - case Intrinsic::memmove: { - auto *Memmove = cast<MemMoveInst>(Inst); - if (shouldExpandOperationWithSize(Memmove->getLength())) { - expandMemMoveAsLoop(Memmove); - Changed = true; - Memmove->eraseFromParent(); - } - - break; - } - case Intrinsic::memset: { - auto *Memset = cast<MemSetInst>(Inst); - if (shouldExpandOperationWithSize(Memset->getLength())) { - expandMemSetAsLoop(Memset); - Changed = true; - Memset->eraseFromParent(); - } - - break; - } - default: - break; - } - } - - return Changed; -} - -bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const { - auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); - if (!TPC) - return false; - - const TargetMachine &TM = TPC->getTM<TargetMachine>(); - bool Changed = false; - - for (auto *U : F.users()) { - auto *CI = dyn_cast<CallInst>(U); - if (!CI) - continue; - - Function *Caller = 
CI->getParent()->getParent(); - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *Caller); - Changed |= ST.makeLIDRangeMetadata(CI); - } - return Changed; -} - -bool AMDGPULowerIntrinsics::runOnModule(Module &M) { - bool Changed = false; - - for (Function &F : M) { - if (!F.isDeclaration()) - continue; - - switch (F.getIntrinsicID()) { - case Intrinsic::memcpy: - case Intrinsic::memmove: - case Intrinsic::memset: - if (expandMemIntrinsicUses(F)) - Changed = true; - break; - - case Intrinsic::r600_read_tidig_x: - case Intrinsic::r600_read_tidig_y: - case Intrinsic::r600_read_tidig_z: - case Intrinsic::r600_read_local_size_x: - case Intrinsic::r600_read_local_size_y: - case Intrinsic::r600_read_local_size_z: - Changed |= makeLIDRangeMetadata(F); - break; - - default: - break; - } - } - - return Changed; -} - -ModulePass *llvm::createAMDGPULowerIntrinsicsPass() { - return new AMDGPULowerIntrinsics(); -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index f3ff9b753585..f5323725250f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -70,7 +70,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { IRBuilder<> Builder(&*getInsertPt(EntryBlock)); const Align KernArgBaseAlign(16); // FIXME: Increase if necessary - const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F); + const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(); Align MaxAlign; // FIXME: Alignment is broken with explicit arg offset.; @@ -86,7 +86,6 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { KernArgSegment->addRetAttr( Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); - unsigned AS = KernArgSegment->getType()->getPointerAddressSpace(); uint64_t ExplicitArgOffset = 0; for (Argument &Arg : F.args()) { @@ -111,8 +110,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { 
Builder.getInt8Ty(), KernArgSegment, EltOffset, Arg.getName() + ".byval.kernarg.offset"); - Value *CastOffsetPtr = Builder.CreatePointerBitCastOrAddrSpaceCast( - ArgOffsetPtr, Arg.getType()); + Value *CastOffsetPtr = + Builder.CreateAddrSpaceCast(ArgOffsetPtr, Arg.getType()); Arg.replaceAllUsesWith(CastOffsetPtr); continue; } @@ -170,8 +169,6 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { AdjustedArgTy = V4Ty; } - ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS), - ArgPtr->getName() + ".cast"); LoadInst *Load = Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign); Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {})); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index 56e5e0708492..26074cf06071 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -322,7 +322,7 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { // TargetPassConfig for subtarget. bool AMDGPULowerKernelAttributes::runOnModule(Module &M) { bool MadeChange = false; - bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5; + bool IsV5OrAbove = AMDGPU::getCodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5; Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove); if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used. @@ -354,7 +354,8 @@ ModulePass *llvm::createAMDGPULowerKernelAttributesPass() { PreservedAnalyses AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) { - bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5; + bool IsV5OrAbove = + AMDGPU::getCodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5; Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove); if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 11ba5c91dae9..e3a645977f92 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -20,9 +20,8 @@ // This model means the GPU runtime can specify the amount of memory allocated. // If this is more than the kernel assumed, the excess can be made available // using a language specific feature, which IR represents as a variable with -// no initializer. This feature is not yet implemented for non-kernel functions. -// This lowering could be extended to handle that use case, but would probably -// require closer integration with promoteAllocaToLDS. +// no initializer. This feature is referred to here as "Dynamic LDS" and is +// lowered slightly differently to the normal case. // // Consequences of this GPU feature: // - memory is limited and exceeding it halts compilation @@ -65,17 +64,15 @@ // Kernel | Yes | Yes | No | // Hybrid | Yes | Partial | Yes | // -// Module spends LDS memory to save cycles. Table spends cycles and global -// memory to save LDS. Kernel is as fast as kernel allocation but only works -// for variables that are known reachable from a single kernel. Hybrid picks -// between all three. When forced to choose between LDS and cycles it minimises +// "Module" spends LDS memory to save cycles. "Table" spends cycles and global +// memory to save LDS. "Kernel" is as fast as kernel allocation but only works +// for variables that are known reachable from a single kernel. "Hybrid" picks +// between all three. When forced to choose between LDS and cycles we minimise // LDS use. // The "module" lowering implemented here finds LDS variables which are used by // non-kernel functions and creates a new struct with a field for each of those // LDS variables. Variables that are only used from kernels are excluded. 
-// Kernels that do not use this struct are annoteated with the attribute -// amdgpu-elide-module-lds which allows the back end to elide the allocation. // // The "table" lowering implemented here has three components. // First kernels are assigned a unique integer identifier which is available in @@ -115,6 +112,68 @@ // use LDS are expected to hit the "Kernel" lowering strategy // - The runtime properties impose a cost in compiler implementation complexity // +// Dynamic LDS implementation +// Dynamic LDS is lowered similarly to the "table" strategy above and uses the +// same intrinsic to identify which kernel is at the root of the dynamic call +// graph. This relies on the specified behaviour that all dynamic LDS variables +// alias one another, i.e. are at the same address, with respect to a given +// kernel. Therefore this pass creates new dynamic LDS variables for each kernel +// that allocates any dynamic LDS and builds a table of addresses out of those. +// The AMDGPUPromoteAlloca pass skips kernels that use dynamic LDS. +// The corresponding optimisation for "kernel" lowering where the table lookup +// is elided is not implemented. +// +// +// Implementation notes / limitations +// A single LDS global variable represents an instance per kernel that can reach +// said variables. This pass essentially specialises said variables per kernel. +// Handling ConstantExpr during the pass complicated this significantly so now +// all ConstantExpr uses of LDS variables are expanded to instructions. This +// may need amending when implementing non-undef initialisers. +// +// Lowering is split between this IR pass and the back end. This pass chooses +// where given variables should be allocated and marks them with metadata, +// MD_absolute_symbol. The backend places the variables in coincidentally the +// same location and raises a fatal error if something has gone awry. 
This works +// in practice because the only pass between this one and the backend that +// changes LDS is PromoteAlloca and the changes it makes do not conflict. +// +// Addresses are written to constant global arrays based on the same metadata. +// +// The backend lowers LDS variables in the order of traversal of the function. +// This is at odds with the deterministic layout required. The workaround is to +// allocate the fixed-address variables immediately upon starting the function +// where they can be placed as intended. This requires a means of mapping from +// the function to the variables that it allocates. For the module scope lds, +// this is via metadata indicating whether the variable is not required. If a +// pass deletes that metadata, a fatal error on disagreement with the absolute +// symbol metadata will occur. For kernel scope and dynamic, this is by _name_ +// correspondence between the function and the variable. It requires the +// kernel to have a name (which is only a limitation for tests in practice) and +// for nothing to rename the corresponding symbols. This is a hazard if the pass +// is run multiple times during debugging. Alternative schemes considered all +// involve bespoke metadata. +// +// If the name correspondence can be replaced, multiple distinct kernels that +// have the same memory layout can map to the same kernel id (as the address +// itself is handled by the absolute symbol metadata) and that will allow more +// uses of the "kernel" style faster lowering and reduce the size of the lookup +// tables. +// +// There is a test that checks this does not fire for a graphics shader. This +// lowering is expected to work for graphics if the isKernel test is changed. +// +// The current markUsedByKernel is sufficient for PromoteAlloca but is elided +// before codegen. Replacing this with an equivalent intrinsic which lasts until +// shortly after the machine function lowering of LDS would help break the name +// mapping. 
The other part needed is probably to amend PromoteAlloca to embed +// the LDS variables it creates in the same struct created here. That avoids the +// current hazard where a PromoteAlloca LDS variable might be allocated before +// the kernel scope (and thus error on the address check). Given a new invariant +// that no LDS variables exist outside of the structs managed here, and an +// intrinsic that lasts until after the LDS frame lowering, it should be +// possible to drop the name mapping and fold equivalent memory layouts. +// //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -134,11 +193,14 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/ReplaceConstant.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" #include "llvm/Support/OptimizedStructLayout.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -162,7 +224,7 @@ enum class LoweringKind { module, table, kernel, hybrid }; cl::opt<LoweringKind> LoweringKindLoc( "amdgpu-lower-module-lds-strategy", cl::desc("Specify lowering strategy for function LDS access:"), cl::Hidden, - cl::init(LoweringKind::module), + cl::init(LoweringKind::hybrid), cl::values( clEnumValN(LoweringKind::table, "table", "Lower via table lookup"), clEnumValN(LoweringKind::module, "module", "Lower via module struct"), @@ -183,6 +245,13 @@ bool isKernelLDS(const Function *F) { return AMDGPU::isKernel(F->getCallingConv()); } +template <typename T> std::vector<T> sortByName(std::vector<T> &&V) { + llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) { + return L->getName() < R->getName(); + }); + return {std::move(V)}; +} + class AMDGPULowerModuleLDS : public ModulePass { static void @@ -201,8 +270,7 @@ 
class AMDGPULowerModuleLDS : public ModulePass { LocalVar->removeDeadConstantUsers(); } - static void markUsedByKernel(IRBuilder<> &Builder, Function *Func, - GlobalVariable *SGV) { + static void markUsedByKernel(Function *Func, GlobalVariable *SGV) { // The llvm.amdgcn.module.lds instance is implicitly used by all kernels // that might call a function which accesses a field within it. This is // presently approximated to 'all kernels' if there are any such functions @@ -217,21 +285,22 @@ class AMDGPULowerModuleLDS : public ModulePass { // llvm.donothing that takes a pointer to the instance and is lowered to a // no-op after LDS is allocated, but that is not presently necessary. - LLVMContext &Ctx = Func->getContext(); - - Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI()); - - FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {}); + // This intrinsic is eliminated shortly before instruction selection. It + // does not suffice to indicate to ISel that a given global which is not + // immediately used by the kernel must still be allocated by it. An + // equivalent target specific intrinsic which lasts until immediately after + // codegen would suffice for that, but one would still need to ensure that + // the variables are allocated in the anticipated order. 
+ IRBuilder<> Builder(Func->getEntryBlock().getFirstNonPHI()); Function *Decl = Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {}); - Value *UseInstance[1] = {Builder.CreateInBoundsGEP( - SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))}; + Value *UseInstance[1] = { + Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)}; - Builder.CreateCall(FTy, Decl, {}, - {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)}, - ""); + Builder.CreateCall( + Decl, {}, {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)}); } static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) { @@ -240,7 +309,7 @@ class AMDGPULowerModuleLDS : public ModulePass { // This pass specialises LDS variables with respect to the kernel that // allocates them. - // This is semantically equivalent to: + // This is semantically equivalent to (the unimplemented as slow): // for (auto &F : M.functions()) // for (auto &BB : F) // for (auto &I : BB) @@ -248,63 +317,12 @@ class AMDGPULowerModuleLDS : public ModulePass { // if (constantExprUsesLDS(Op)) // replaceConstantExprInFunction(I, Op); - bool Changed = false; - - // Find all ConstantExpr that are direct users of an LDS global - SmallVector<ConstantExpr *> Stack; + SmallVector<Constant *> LDSGlobals; for (auto &GV : M.globals()) if (AMDGPU::isLDSVariableToLower(GV)) - for (User *U : GV.users()) - if (ConstantExpr *C = dyn_cast<ConstantExpr>(U)) - Stack.push_back(C); - - // Expand to include constexpr users of direct users - SetVector<ConstantExpr *> ConstExprUsersOfLDS; - while (!Stack.empty()) { - ConstantExpr *V = Stack.pop_back_val(); - if (ConstExprUsersOfLDS.contains(V)) - continue; - - ConstExprUsersOfLDS.insert(V); + LDSGlobals.push_back(&GV); - for (auto *Nested : V->users()) - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Nested)) - Stack.push_back(CE); - } - - // Find all instructions that use any of the ConstExpr users of LDS - SetVector<Instruction *> 
InstructionWorklist; - for (ConstantExpr *CE : ConstExprUsersOfLDS) - for (User *U : CE->users()) - if (auto *I = dyn_cast<Instruction>(U)) - InstructionWorklist.insert(I); - - // Replace those ConstExpr operands with instructions - while (!InstructionWorklist.empty()) { - Instruction *I = InstructionWorklist.pop_back_val(); - for (Use &U : I->operands()) { - - auto *BI = I; - if (auto *Phi = dyn_cast<PHINode>(I)) { - BasicBlock *BB = Phi->getIncomingBlock(U); - BasicBlock::iterator It = BB->getFirstInsertionPt(); - assert(It != BB->end() && "Unexpected empty basic block"); - BI = &(*(It)); - } - - if (ConstantExpr *C = dyn_cast<ConstantExpr>(U.get())) { - if (ConstExprUsersOfLDS.contains(C)) { - Changed = true; - Instruction *NI = C->getAsInstruction(BI); - InstructionWorklist.insert(NI); - U.set(NI); - C->removeDeadConstantUsers(); - } - } - } - } - - return Changed; + return convertUsersOfConstantsToInstructions(LDSGlobals); } public: @@ -329,7 +347,11 @@ public: continue; } - SmallVector<User *, 16> Stack(GV.users()); + if (GV.isAbsoluteSymbolRef()) { + report_fatal_error( + "LDS variables with absolute addresses are unimplemented."); + } + for (User *V : GV.users()) { if (auto *I = dyn_cast<Instruction>(V)) { Function *F = I->getFunction(); @@ -358,11 +380,11 @@ public: DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer; for (Function &F : M.functions()) { if (!isKernelLDS(&F)) - if (F.hasAddressTaken(nullptr, - /* IgnoreCallbackUses */ false, - /* IgnoreAssumeLikeCalls */ false, - /* IgnoreLLVMUsed */ true, - /* IgnoreArcAttachedCall */ false)) { + if (F.hasAddressTaken(nullptr, + /* IgnoreCallbackUses */ false, + /* IgnoreAssumeLikeCalls */ false, + /* IgnoreLLVMUsed */ true, + /* IgnoreArcAttachedCall */ false)) { set_union(VariablesReachableThroughFunctionPointer, direct_map_function[&F]); } @@ -370,7 +392,7 @@ public: auto functionMakesUnknownCall = [&](const Function *F) -> bool { assert(!F->isDeclaration()); - for 
(CallGraphNode::CallRecord R : *CG[F]) { + for (const CallGraphNode::CallRecord &R : *CG[F]) { if (!R.second->getFunction()) { return true; } @@ -408,7 +430,7 @@ public: // have already been computed, with more care than this set_union(transitive_map_function[&Func], direct_map_function[F]); - for (CallGraphNode::CallRecord R : *CG[F]) { + for (const CallGraphNode::CallRecord &R : *CG[F]) { Function *ith = R.second->getFunction(); if (ith) { if (!seen.contains(ith)) { @@ -428,7 +450,7 @@ public: if (Func.isDeclaration() || !isKernelLDS(&Func)) continue; - for (CallGraphNode::CallRecord R : *CG[&Func]) { + for (const CallGraphNode::CallRecord &R : *CG[&Func]) { Function *ith = R.second->getFunction(); if (ith) { set_union(indirect_map_kernel[&Func], transitive_map_function[ith]); @@ -454,7 +476,7 @@ public: static Constant *getAddressesOfVariablesInKernel( LLVMContext &Ctx, ArrayRef<GlobalVariable *> Variables, - DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) { + const DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) { // Create a ConstantArray containing the address of each Variable within the // kernel corresponding to LDSVarsToConstantGEP, or poison if that kernel // does not allocate it @@ -467,8 +489,9 @@ public: SmallVector<Constant *> Elements; for (size_t i = 0; i < Variables.size(); i++) { GlobalVariable *GV = Variables[i]; - if (LDSVarsToConstantGEP.count(GV) != 0) { - auto elt = ConstantExpr::getPtrToInt(LDSVarsToConstantGEP[GV], I32); + auto ConstantGepIt = LDSVarsToConstantGEP.find(GV); + if (ConstantGepIt != LDSVarsToConstantGEP.end()) { + auto elt = ConstantExpr::getPtrToInt(ConstantGepIt->second, I32); Elements.push_back(elt); } else { Elements.push_back(PoisonValue::get(I32)); @@ -495,11 +518,15 @@ public: ArrayType *AllKernelsOffsetsType = ArrayType::get(KernelOffsetsType, NumberKernels); + Constant *Missing = PoisonValue::get(KernelOffsetsType); std::vector<Constant *> overallConstantExprElts(NumberKernels); for (size_t 
i = 0; i < NumberKernels; i++) { - LDSVariableReplacement Replacement = KernelToReplacement[kernels[i]]; - overallConstantExprElts[i] = getAddressesOfVariablesInKernel( - Ctx, Variables, Replacement.LDSVarsToConstantGEP); + auto Replacement = KernelToReplacement.find(kernels[i]); + overallConstantExprElts[i] = + (Replacement == KernelToReplacement.end()) + ? Missing + : getAddressesOfVariablesInKernel( + Ctx, Variables, Replacement->second.LDSVarsToConstantGEP); } Constant *init = @@ -511,36 +538,49 @@ public: AMDGPUAS::CONSTANT_ADDRESS); } - void replaceUsesInInstructionsWithTableLookup( - Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables, - GlobalVariable *LookupTable) { - + void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder, + GlobalVariable *LookupTable, + GlobalVariable *GV, Use &U, + Value *OptionalIndex) { + // Table is a constant array of the same length as OrderedKernels LLVMContext &Ctx = M.getContext(); - IRBuilder<> Builder(Ctx); Type *I32 = Type::getInt32Ty(Ctx); + auto *I = cast<Instruction>(U.getUser()); - // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which - // lowers to a read from a live in register. Emit it once in the entry - // block to spare deduplicating it later. 
+ Value *tableKernelIndex = getTableLookupKernelIndex(M, I->getFunction()); - DenseMap<Function *, Value *> tableKernelIndexCache; - auto getTableKernelIndex = [&](Function *F) -> Value * { - if (tableKernelIndexCache.count(F) == 0) { - LLVMContext &Ctx = M.getContext(); - FunctionType *FTy = FunctionType::get(Type::getInt32Ty(Ctx), {}); - Function *Decl = - Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {}); + if (auto *Phi = dyn_cast<PHINode>(I)) { + BasicBlock *BB = Phi->getIncomingBlock(U); + Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt()))); + } else { + Builder.SetInsertPoint(I); + } - BasicBlock::iterator it = - F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); - Instruction &i = *it; - Builder.SetInsertPoint(&i); + SmallVector<Value *, 3> GEPIdx = { + ConstantInt::get(I32, 0), + tableKernelIndex, + }; + if (OptionalIndex) + GEPIdx.push_back(OptionalIndex); - tableKernelIndexCache[F] = Builder.CreateCall(FTy, Decl, {}); - } + Value *Address = Builder.CreateInBoundsGEP( + LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName()); - return tableKernelIndexCache[F]; - }; + Value *loaded = Builder.CreateLoad(I32, Address); + + Value *replacement = + Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName()); + + U.set(replacement); + } + + void replaceUsesInInstructionsWithTableLookup( + Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables, + GlobalVariable *LookupTable) { + + LLVMContext &Ctx = M.getContext(); + IRBuilder<> Builder(Ctx); + Type *I32 = Type::getInt32Ty(Ctx); for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) { auto *GV = ModuleScopeVariables[Index]; @@ -550,32 +590,8 @@ public: if (!I) continue; - Value *tableKernelIndex = getTableKernelIndex(I->getFunction()); - - // So if the phi uses this value multiple times, what does this look - // like? 
- if (auto *Phi = dyn_cast<PHINode>(I)) { - BasicBlock *BB = Phi->getIncomingBlock(U); - Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt()))); - } else { - Builder.SetInsertPoint(I); - } - - Value *GEPIdx[3] = { - ConstantInt::get(I32, 0), - tableKernelIndex, - ConstantInt::get(I32, Index), - }; - - Value *Address = Builder.CreateInBoundsGEP( - LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName()); - - Value *loaded = Builder.CreateLoad(I32, Address); - - Value *replacement = - Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName()); - - U.set(replacement); + replaceUseWithTableLookup(M, Builder, LookupTable, GV, U, + ConstantInt::get(I32, Index)); } } } @@ -586,7 +602,8 @@ public: DenseSet<Function *> KernelSet; - if (VariableSet.empty()) return KernelSet; + if (VariableSet.empty()) + return KernelSet; for (Function &Func : M.functions()) { if (Func.isDeclaration() || !isKernelLDS(&Func)) @@ -649,8 +666,9 @@ public: // strategy continue; } - CandidateTy Candidate(GV, K.second.size(), - DL.getTypeAllocSize(GV->getValueType()).getFixedValue()); + CandidateTy Candidate( + GV, K.second.size(), + DL.getTypeAllocSize(GV->getValueType()).getFixedValue()); if (MostUsed < Candidate) MostUsed = Candidate; } @@ -658,173 +676,258 @@ public: return MostUsed.GV; } - bool runOnModule(Module &M) override { - LLVMContext &Ctx = M.getContext(); - CallGraph CG = CallGraph(M); - bool Changed = superAlignLDSGlobals(M); + static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV, + uint32_t Address) { + // Write the specified address into metadata where it can be retrieved by + // the assembler. 
Format is a half open range, [Address, Address+1) + LLVMContext &Ctx = M->getContext(); + auto *IntTy = + M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); + auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address)); + auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1)); + GV->setMetadata(LLVMContext::MD_absolute_symbol, + MDNode::get(Ctx, {MinC, MaxC})); + } - Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); + DenseMap<Function *, Value *> tableKernelIndexCache; + Value *getTableLookupKernelIndex(Module &M, Function *F) { + // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which + // lowers to a read from a live in register. Emit it once in the entry + // block to spare deduplicating it later. + auto [It, Inserted] = tableKernelIndexCache.try_emplace(F); + if (Inserted) { + Function *Decl = + Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {}); - Changed = true; // todo: narrow this down + auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); + IRBuilder<> Builder(&*InsertAt); - // For each kernel, what variables does it access directly or through - // callees - LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M); + It->second = Builder.CreateCall(Decl, {}); + } - // For each variable accessed through callees, which kernels access it - VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly; - for (auto &K : LDSUsesInfo.indirect_access) { - Function *F = K.first; - assert(isKernelLDS(F)); - for (GlobalVariable *GV : K.second) { - LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F); + return It->second; + } + + static std::vector<Function *> assignLDSKernelIDToEachKernel( + Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS, + DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) { + // Associate kernels in the set with an arbitrary but reproducible order and + // annotate them with that order in metadata. 
This metadata is recognised by + // the backend and lowered to a SGPR which can be read from using + // amdgcn_lds_kernel_id. + + std::vector<Function *> OrderedKernels; + if (!KernelsThatAllocateTableLDS.empty() || + !KernelsThatIndirectlyAllocateDynamicLDS.empty()) { + + for (Function &Func : M->functions()) { + if (Func.isDeclaration()) + continue; + if (!isKernelLDS(&Func)) + continue; + + if (KernelsThatAllocateTableLDS.contains(&Func) || + KernelsThatIndirectlyAllocateDynamicLDS.contains(&Func)) { + assert(Func.hasName()); // else fatal error earlier + OrderedKernels.push_back(&Func); + } + } + + // Put them in an arbitrary but reproducible order + OrderedKernels = sortByName(std::move(OrderedKernels)); + + // Annotate the kernels with their order in this vector + LLVMContext &Ctx = M->getContext(); + IRBuilder<> Builder(Ctx); + + if (OrderedKernels.size() > UINT32_MAX) { + // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU + report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels"); + } + + for (size_t i = 0; i < OrderedKernels.size(); i++) { + Metadata *AttrMDArgs[1] = { + ConstantAsMetadata::get(Builder.getInt32(i)), + }; + OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id", + MDNode::get(Ctx, AttrMDArgs)); } } + return OrderedKernels; + } - // Partition variables into the different strategies - DenseSet<GlobalVariable *> ModuleScopeVariables; - DenseSet<GlobalVariable *> TableLookupVariables; - DenseSet<GlobalVariable *> KernelAccessVariables; + static void partitionVariablesIntoIndirectStrategies( + Module &M, LDSUsesInfoTy const &LDSUsesInfo, + VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly, + DenseSet<GlobalVariable *> &ModuleScopeVariables, + DenseSet<GlobalVariable *> &TableLookupVariables, + DenseSet<GlobalVariable *> &KernelAccessVariables, + DenseSet<GlobalVariable *> &DynamicVariables) { - { - GlobalVariable *HybridModuleRoot = - LoweringKindLoc != LoweringKind::hybrid - ? 
nullptr - : chooseBestVariableForModuleStrategy( - M.getDataLayout(), - LDSToKernelsThatNeedToAccessItIndirectly); + GlobalVariable *HybridModuleRoot = + LoweringKindLoc != LoweringKind::hybrid + ? nullptr + : chooseBestVariableForModuleStrategy( + M.getDataLayout(), LDSToKernelsThatNeedToAccessItIndirectly); - DenseSet<Function *> const EmptySet; - DenseSet<Function *> const &HybridModuleRootKernels = - HybridModuleRoot - ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot] - : EmptySet; + DenseSet<Function *> const EmptySet; + DenseSet<Function *> const &HybridModuleRootKernels = + HybridModuleRoot + ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot] + : EmptySet; - for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { - // Each iteration of this loop assigns exactly one global variable to - // exactly one of the implementation strategies. + for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) { + // Each iteration of this loop assigns exactly one global variable to + // exactly one of the implementation strategies. 
- GlobalVariable *GV = K.first; - assert(AMDGPU::isLDSVariableToLower(*GV)); - assert(K.second.size() != 0); + GlobalVariable *GV = K.first; + assert(AMDGPU::isLDSVariableToLower(*GV)); + assert(K.second.size() != 0); - switch (LoweringKindLoc) { - case LoweringKind::module: - ModuleScopeVariables.insert(GV); - break; + if (AMDGPU::isDynamicLDS(*GV)) { + DynamicVariables.insert(GV); + continue; + } - case LoweringKind::table: - TableLookupVariables.insert(GV); - break; + switch (LoweringKindLoc) { + case LoweringKind::module: + ModuleScopeVariables.insert(GV); + break; - case LoweringKind::kernel: - if (K.second.size() == 1) { - KernelAccessVariables.insert(GV); - } else { - report_fatal_error( - "cannot lower LDS '" + GV->getName() + - "' to kernel access as it is reachable from multiple kernels"); - } - break; + case LoweringKind::table: + TableLookupVariables.insert(GV); + break; - case LoweringKind::hybrid: { - if (GV == HybridModuleRoot) { - assert(K.second.size() != 1); - ModuleScopeVariables.insert(GV); - } else if (K.second.size() == 1) { - KernelAccessVariables.insert(GV); - } else if (set_is_subset(K.second, HybridModuleRootKernels)) { - ModuleScopeVariables.insert(GV); - } else { - TableLookupVariables.insert(GV); - } - break; + case LoweringKind::kernel: + if (K.second.size() == 1) { + KernelAccessVariables.insert(GV); + } else { + report_fatal_error( + "cannot lower LDS '" + GV->getName() + + "' to kernel access as it is reachable from multiple kernels"); } + break; + + case LoweringKind::hybrid: { + if (GV == HybridModuleRoot) { + assert(K.second.size() != 1); + ModuleScopeVariables.insert(GV); + } else if (K.second.size() == 1) { + KernelAccessVariables.insert(GV); + } else if (set_is_subset(K.second, HybridModuleRootKernels)) { + ModuleScopeVariables.insert(GV); + } else { + TableLookupVariables.insert(GV); } + break; + } } + } - assert(ModuleScopeVariables.size() + TableLookupVariables.size() + - KernelAccessVariables.size() == - 
LDSToKernelsThatNeedToAccessItIndirectly.size()); - } // Variables have now been partitioned into the three lowering strategies. + // All LDS variables accessed indirectly have now been partitioned into + // the distinct lowering strategies. + assert(ModuleScopeVariables.size() + TableLookupVariables.size() + + KernelAccessVariables.size() + DynamicVariables.size() == + LDSToKernelsThatNeedToAccessItIndirectly.size()); + } - // If the kernel accesses a variable that is going to be stored in the - // module instance through a call then that kernel needs to allocate the - // module instance - DenseSet<Function *> KernelsThatAllocateModuleLDS = - kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, - ModuleScopeVariables); - DenseSet<Function *> KernelsThatAllocateTableLDS = - kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, - TableLookupVariables); + static GlobalVariable *lowerModuleScopeStructVariables( + Module &M, DenseSet<GlobalVariable *> const &ModuleScopeVariables, + DenseSet<Function *> const &KernelsThatAllocateModuleLDS) { + // Create a struct to hold the ModuleScopeVariables + // Replace all uses of those variables from non-kernel functions with the + // new struct instance Replace only the uses from kernel functions that will + // allocate this instance. That is a space optimisation - kernels that use a + // subset of the module scope struct and do not need to allocate it for + // indirect calls will only allocate the subset they use (they do so as part + // of the per-kernel lowering). 
+ if (ModuleScopeVariables.empty()) { + return nullptr; + } - if (!ModuleScopeVariables.empty()) { - LDSVariableReplacement ModuleScopeReplacement = - createLDSVariableReplacement(M, "llvm.amdgcn.module.lds", - ModuleScopeVariables); + LLVMContext &Ctx = M.getContext(); - appendToCompilerUsed(M, - {static_cast<GlobalValue *>( - ConstantExpr::getPointerBitCastOrAddrSpaceCast( - cast<Constant>(ModuleScopeReplacement.SGV), - Type::getInt8PtrTy(Ctx)))}); + LDSVariableReplacement ModuleScopeReplacement = + createLDSVariableReplacement(M, "llvm.amdgcn.module.lds", + ModuleScopeVariables); - // historic - removeLocalVarsFromUsedLists(M, ModuleScopeVariables); + appendToCompilerUsed(M, {static_cast<GlobalValue *>( + ConstantExpr::getPointerBitCastOrAddrSpaceCast( + cast<Constant>(ModuleScopeReplacement.SGV), + Type::getInt8PtrTy(Ctx)))}); - // Replace all uses of module scope variable from non-kernel functions - replaceLDSVariablesWithStruct( - M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) { - Instruction *I = dyn_cast<Instruction>(U.getUser()); - if (!I) { - return false; - } - Function *F = I->getFunction(); - return !isKernelLDS(F); - }); + // module.lds will be allocated at zero in any kernel that allocates it + recordLDSAbsoluteAddress(&M, ModuleScopeReplacement.SGV, 0); - // Replace uses of module scope variable from kernel functions that - // allocate the module scope variable, otherwise leave them unchanged - // Record on each kernel whether the module scope global is used by it + // historic + removeLocalVarsFromUsedLists(M, ModuleScopeVariables); - LLVMContext &Ctx = M.getContext(); - IRBuilder<> Builder(Ctx); + // Replace all uses of module scope variable from non-kernel functions + replaceLDSVariablesWithStruct( + M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) { + Instruction *I = dyn_cast<Instruction>(U.getUser()); + if (!I) { + return false; + } + Function *F = I->getFunction(); + return !isKernelLDS(F); + }); - for (Function 
&Func : M.functions()) { - if (Func.isDeclaration() || !isKernelLDS(&Func)) - continue; + // Replace uses of module scope variable from kernel functions that + // allocate the module scope variable, otherwise leave them unchanged + // Record on each kernel whether the module scope global is used by it - if (KernelsThatAllocateModuleLDS.contains(&Func)) { - replaceLDSVariablesWithStruct( - M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) { - Instruction *I = dyn_cast<Instruction>(U.getUser()); - if (!I) { - return false; - } - Function *F = I->getFunction(); - return F == &Func; - }); + for (Function &Func : M.functions()) { + if (Func.isDeclaration() || !isKernelLDS(&Func)) + continue; - markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV); + if (KernelsThatAllocateModuleLDS.contains(&Func)) { + replaceLDSVariablesWithStruct( + M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) { + Instruction *I = dyn_cast<Instruction>(U.getUser()); + if (!I) { + return false; + } + Function *F = I->getFunction(); + return F == &Func; + }); - } else { - Func.addFnAttr("amdgpu-elide-module-lds"); - } + markUsedByKernel(&Func, ModuleScopeReplacement.SGV); } } - // Create a struct for each kernel for the non-module-scope variables + return ModuleScopeReplacement.SGV; + } + + static DenseMap<Function *, LDSVariableReplacement> + lowerKernelScopeStructVariables( + Module &M, LDSUsesInfoTy &LDSUsesInfo, + DenseSet<GlobalVariable *> const &ModuleScopeVariables, + DenseSet<Function *> const &KernelsThatAllocateModuleLDS, + GlobalVariable *MaybeModuleScopeStruct) { + + // Create a struct for each kernel for the non-module-scope variables. 
+ DenseMap<Function *, LDSVariableReplacement> KernelToReplacement; for (Function &Func : M.functions()) { if (Func.isDeclaration() || !isKernelLDS(&Func)) continue; DenseSet<GlobalVariable *> KernelUsedVariables; + // Allocating variables that are used directly in this struct to get + // alignment aware allocation and predictable frame size. for (auto &v : LDSUsesInfo.direct_access[&Func]) { - KernelUsedVariables.insert(v); + if (!AMDGPU::isDynamicLDS(*v)) { + KernelUsedVariables.insert(v); + } } + + // Allocating variables that are accessed indirectly so that a lookup of + // this struct instance can find them from nested functions. for (auto &v : LDSUsesInfo.indirect_access[&Func]) { - KernelUsedVariables.insert(v); + if (!AMDGPU::isDynamicLDS(*v)) { + KernelUsedVariables.insert(v); + } } // Variables allocated in module lds must all resolve to that struct, @@ -836,7 +939,8 @@ public: } if (KernelUsedVariables.empty()) { - // Either used no LDS, or all the LDS it used was also in module + // Either used no LDS, or the LDS it used was all in the module struct + // or dynamically sized continue; } @@ -856,6 +960,14 @@ public: auto Replacement = createLDSVariableReplacement(M, VarName, KernelUsedVariables); + // If any indirect uses, create a direct use to ensure allocation + // TODO: Simpler to unconditionally mark used but that regresses + // codegen in test/CodeGen/AMDGPU/noclobber-barrier.ll + auto Accesses = LDSUsesInfo.indirect_access.find(&Func); + if ((Accesses != LDSUsesInfo.indirect_access.end()) && + !Accesses->second.empty()) + markUsedByKernel(&Func, Replacement.SGV); + // remove preserves existing codegen removeLocalVarsFromUsedLists(M, KernelUsedVariables); KernelToReplacement[&Func] = Replacement; @@ -867,6 +979,169 @@ public: return I && I->getFunction() == &Func; }); } + return KernelToReplacement; + } + + static GlobalVariable * + buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo, + Function *func) { + // Create a 
dynamic lds variable with a name associated with the passed + // function that has the maximum alignment of any dynamic lds variable + // reachable from this kernel. Dynamic LDS is allocated after the static LDS + // allocation, possibly after alignment padding. The representative variable + // created here has the maximum alignment of any other dynamic variable + // reachable by that kernel. All dynamic LDS variables are allocated at the + // same address in each kernel in order to provide the documented aliasing + // semantics. Setting the alignment here allows this IR pass to accurately + // predict the exact constant at which it will be allocated. + + assert(isKernelLDS(func)); + + LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + Align MaxDynamicAlignment(1); + + auto UpdateMaxAlignment = [&MaxDynamicAlignment, &DL](GlobalVariable *GV) { + if (AMDGPU::isDynamicLDS(*GV)) { + MaxDynamicAlignment = + std::max(MaxDynamicAlignment, AMDGPU::getAlign(DL, GV)); + } + }; + + for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) { + UpdateMaxAlignment(GV); + } + + for (GlobalVariable *GV : LDSUsesInfo.direct_access[func]) { + UpdateMaxAlignment(GV); + } + + assert(func->hasName()); // Checked by caller + auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0); + GlobalVariable *N = new GlobalVariable( + M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr, + Twine("llvm.amdgcn." 
+ func->getName() + ".dynlds"), nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, + false); + N->setAlignment(MaxDynamicAlignment); + + assert(AMDGPU::isDynamicLDS(*N)); + return N; + } + + DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables( + Module &M, LDSUsesInfoTy &LDSUsesInfo, + DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS, + DenseSet<GlobalVariable *> const &DynamicVariables, + std::vector<Function *> const &OrderedKernels) { + DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS; + if (!KernelsThatIndirectlyAllocateDynamicLDS.empty()) { + LLVMContext &Ctx = M.getContext(); + IRBuilder<> Builder(Ctx); + Type *I32 = Type::getInt32Ty(Ctx); + + std::vector<Constant *> newDynamicLDS; + + // Table is built in the same order as OrderedKernels + for (auto &func : OrderedKernels) { + + if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) { + assert(isKernelLDS(func)); + if (!func->hasName()) { + report_fatal_error("Anonymous kernels cannot use LDS variables"); + } + + GlobalVariable *N = + buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, func); + + KernelToCreatedDynamicLDS[func] = N; + + markUsedByKernel(func, N); + + auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0); + auto GEP = ConstantExpr::getGetElementPtr( + emptyCharArray, N, ConstantInt::get(I32, 0), true); + newDynamicLDS.push_back(ConstantExpr::getPtrToInt(GEP, I32)); + } else { + newDynamicLDS.push_back(PoisonValue::get(I32)); + } + } + assert(OrderedKernels.size() == newDynamicLDS.size()); + + ArrayType *t = ArrayType::get(I32, newDynamicLDS.size()); + Constant *init = ConstantArray::get(t, newDynamicLDS); + GlobalVariable *table = new GlobalVariable( + M, t, true, GlobalValue::InternalLinkage, init, + "llvm.amdgcn.dynlds.offset.table", nullptr, + GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS); + + for (GlobalVariable *GV : DynamicVariables) { + for (Use &U : make_early_inc_range(GV->uses())) { + auto *I = 
dyn_cast<Instruction>(U.getUser()); + if (!I) + continue; + if (isKernelLDS(I->getFunction())) + continue; + + replaceUseWithTableLookup(M, Builder, table, GV, U, nullptr); + } + } + } + return KernelToCreatedDynamicLDS; + } + + bool runOnModule(Module &M) override { + CallGraph CG = CallGraph(M); + bool Changed = superAlignLDSGlobals(M); + + Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M); + + Changed = true; // todo: narrow this down + + // For each kernel, what variables does it access directly or through + // callees + LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M); + + // For each variable accessed through callees, which kernels access it + VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly; + for (auto &K : LDSUsesInfo.indirect_access) { + Function *F = K.first; + assert(isKernelLDS(F)); + for (GlobalVariable *GV : K.second) { + LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F); + } + } + + // Partition variables accessed indirectly into the different strategies + DenseSet<GlobalVariable *> ModuleScopeVariables; + DenseSet<GlobalVariable *> TableLookupVariables; + DenseSet<GlobalVariable *> KernelAccessVariables; + DenseSet<GlobalVariable *> DynamicVariables; + partitionVariablesIntoIndirectStrategies( + M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly, + ModuleScopeVariables, TableLookupVariables, KernelAccessVariables, + DynamicVariables); + + // If the kernel accesses a variable that is going to be stored in the + // module instance through a call then that kernel needs to allocate the + // module instance + const DenseSet<Function *> KernelsThatAllocateModuleLDS = + kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, + ModuleScopeVariables); + const DenseSet<Function *> KernelsThatAllocateTableLDS = + kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, + TableLookupVariables); + + const DenseSet<Function *> KernelsThatIndirectlyAllocateDynamicLDS = + 
kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo, + DynamicVariables); + + GlobalVariable *MaybeModuleScopeStruct = lowerModuleScopeStructVariables( + M, ModuleScopeVariables, KernelsThatAllocateModuleLDS); + + DenseMap<Function *, LDSVariableReplacement> KernelToReplacement = + lowerKernelScopeStructVariables(M, LDSUsesInfo, ModuleScopeVariables, + KernelsThatAllocateModuleLDS, + MaybeModuleScopeStruct); // Lower zero cost accesses to the kernel instances just created for (auto &GV : KernelAccessVariables) { @@ -879,72 +1154,113 @@ public: Vec.insert(GV); replaceLDSVariablesWithStruct(M, Vec, Replacement, [](Use &U) { - return isa<Instruction>(U.getUser()); + return isa<Instruction>(U.getUser()); }); } + // The ith element of this vector is kernel id i + std::vector<Function *> OrderedKernels = + assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS, + KernelsThatIndirectlyAllocateDynamicLDS); + if (!KernelsThatAllocateTableLDS.empty()) { - // Collect the kernels that allocate table lookup LDS - std::vector<Function *> OrderedKernels; - { - for (Function &Func : M.functions()) { - if (Func.isDeclaration()) - continue; - if (!isKernelLDS(&Func)) - continue; + LLVMContext &Ctx = M.getContext(); + IRBuilder<> Builder(Ctx); - if (KernelsThatAllocateTableLDS.contains(&Func)) { - assert(Func.hasName()); // else fatal error earlier - OrderedKernels.push_back(&Func); - } - } + // The order must be consistent between lookup table and accesses to + // lookup table + auto TableLookupVariablesOrdered = + sortByName(std::vector<GlobalVariable *>(TableLookupVariables.begin(), + TableLookupVariables.end())); + + GlobalVariable *LookupTable = buildLookupTable( + M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement); + replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered, + LookupTable); + } - // Put them in an arbitrary but reproducible order - llvm::sort(OrderedKernels.begin(), OrderedKernels.end(), - [](const Function 
*lhs, const Function *rhs) -> bool { - return lhs->getName() < rhs->getName(); - }); + DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS = + lowerDynamicLDSVariables(M, LDSUsesInfo, + KernelsThatIndirectlyAllocateDynamicLDS, + DynamicVariables, OrderedKernels); + + // All kernel frames have been allocated. Calculate and record the + // addresses. + { + const DataLayout &DL = M.getDataLayout(); + + for (Function &Func : M.functions()) { + if (Func.isDeclaration() || !isKernelLDS(&Func)) + continue; - // Annotate the kernels with their order in this vector - LLVMContext &Ctx = M.getContext(); - IRBuilder<> Builder(Ctx); + // All three of these are optional. The first variable is allocated at + // zero. They are allocated by AMDGPUMachineFunction as one block. + // Layout: + //{ + // module.lds + // alignment padding + // kernel instance + // alignment padding + // dynamic lds variables + //} - if (OrderedKernels.size() > UINT32_MAX) { - // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU - report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels"); + const bool AllocateModuleScopeStruct = + MaybeModuleScopeStruct && + KernelsThatAllocateModuleLDS.contains(&Func); + + auto Replacement = KernelToReplacement.find(&Func); + const bool AllocateKernelScopeStruct = + Replacement != KernelToReplacement.end(); + + const bool AllocateDynamicVariable = + KernelToCreatedDynamicLDS.contains(&Func); + + uint32_t Offset = 0; + + if (AllocateModuleScopeStruct) { + // Allocated at zero, recorded once on construction, not once per + // kernel + Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType()); } - for (size_t i = 0; i < OrderedKernels.size(); i++) { - Metadata *AttrMDArgs[1] = { - ConstantAsMetadata::get(Builder.getInt32(i)), - }; - OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id", - MDNode::get(Ctx, AttrMDArgs)); + if (AllocateKernelScopeStruct) { + GlobalVariable *KernelStruct = Replacement->second.SGV; + 
Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct)); + recordLDSAbsoluteAddress(&M, KernelStruct, Offset); + Offset += DL.getTypeAllocSize(KernelStruct->getValueType()); + } - markUsedByKernel(Builder, OrderedKernels[i], - KernelToReplacement[OrderedKernels[i]].SGV); + // If there is dynamic allocation, the alignment needed is included in + // the static frame size. There may be no reference to the dynamic + // variable in the kernel itself, so without including it here, that + // alignment padding could be missed. + if (AllocateDynamicVariable) { + GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func]; + Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable)); + recordLDSAbsoluteAddress(&M, DynamicVariable, Offset); } - } - // The order must be consistent between lookup table and accesses to - // lookup table - std::vector<GlobalVariable *> TableLookupVariablesOrdered( - TableLookupVariables.begin(), TableLookupVariables.end()); - llvm::sort(TableLookupVariablesOrdered.begin(), - TableLookupVariablesOrdered.end(), - [](const GlobalVariable *lhs, const GlobalVariable *rhs) { - return lhs->getName() < rhs->getName(); - }); + if (Offset != 0) { + std::string Buffer; + raw_string_ostream SS{Buffer}; + SS << format("%u", Offset); - GlobalVariable *LookupTable = buildLookupTable( - M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement); - replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered, - LookupTable); + // Instead of explictly marking kernels that access dynamic variables + // using special case metadata, annotate with min-lds == max-lds, i.e. + // that there is no more space available for allocating more static + // LDS variables. That is the right condition to prevent allocating + // more variables which would collide with the addresses assigned to + // dynamic variables. 
+ if (AllocateDynamicVariable) + SS << format(",%u", Offset); + + Func.addFnAttr("amdgpu-lds-size", Buffer); + } + } } for (auto &GV : make_early_inc_range(M.globals())) if (AMDGPU::isLDSVariableToLower(GV)) { - // probably want to remove from used lists GV.removeDeadConstantUsers(); if (GV.use_empty()) @@ -1017,12 +1333,9 @@ private: // The order of fields in this struct depends on the order of // varables in the argument which varies when changing how they // are identified, leading to spurious test breakage. - std::vector<GlobalVariable *> Sorted(LDSVarsToTransform.begin(), - LDSVarsToTransform.end()); - llvm::sort(Sorted.begin(), Sorted.end(), - [](const GlobalVariable *lhs, const GlobalVariable *rhs) { - return lhs->getName() < rhs->getName(); - }); + auto Sorted = sortByName(std::vector<GlobalVariable *>( + LDSVarsToTransform.begin(), LDSVarsToTransform.end())); + for (GlobalVariable *GV : Sorted) { OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()), @@ -1101,21 +1414,17 @@ private: } template <typename PredicateTy> - void replaceLDSVariablesWithStruct( + static void replaceLDSVariablesWithStruct( Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg, - LDSVariableReplacement Replacement, PredicateTy Predicate) { + const LDSVariableReplacement &Replacement, PredicateTy Predicate) { LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); // A hack... we need to insert the aliasing info in a predictable order for // lit tests. 
Would like to have them in a stable order already, ideally the // same order they get allocated, which might mean an ordered set container - std::vector<GlobalVariable *> LDSVarsToTransform( - LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end()); - llvm::sort(LDSVarsToTransform.begin(), LDSVarsToTransform.end(), - [](const GlobalVariable *lhs, const GlobalVariable *rhs) { - return lhs->getName() < rhs->getName(); - }); + auto LDSVarsToTransform = sortByName(std::vector<GlobalVariable *>( + LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end())); // Create alias.scope and their lists. Each field in the new structure // does not alias with all other fields. @@ -1137,7 +1446,7 @@ private: // field of the instance that will be allocated by AMDGPUMachineFunction for (size_t I = 0; I < NumberVars; I++) { GlobalVariable *GV = LDSVarsToTransform[I]; - Constant *GEP = Replacement.LDSVarsToConstantGEP[GV]; + Constant *GEP = Replacement.LDSVarsToConstantGEP.at(GV); GV->replaceUsesWithIf(GEP, Predicate); @@ -1159,9 +1468,9 @@ private: } } - void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL, - MDNode *AliasScope, MDNode *NoAlias, - unsigned MaxDepth = 5) { + static void refineUsesAlignmentAndAA(Value *Ptr, Align A, + const DataLayout &DL, MDNode *AliasScope, + MDNode *NoAlias, unsigned MaxDepth = 5) { if (!MaxDepth || (A == 1 && !AliasScope)) return; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index d88a2cd961b2..c24d39b9e5fd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -13,6 +13,7 @@ // #include "AMDGPUMCInstLower.h" +#include "AMDGPU.h" #include "AMDGPUAsmPrinter.h" #include "AMDGPUMachineFunction.h" #include "AMDGPUTargetMachine.h" @@ -133,7 +134,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.addOperand(Dest); OutMI.addOperand(Src); return; - } else if (Opcode == 
AMDGPU::SI_TCRETURN) { + } else if (Opcode == AMDGPU::SI_TCRETURN || + Opcode == AMDGPU::SI_TCRETURN_GFX) { // TODO: How to use branch immediate and avoid register+add? Opcode = AMDGPU::S_SETPC_B64; } @@ -168,12 +170,11 @@ bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO, const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) { // Intercept LDS variables with known addresses - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(CV)) { - if (AMDGPUMachineFunction::isKnownAddressLDSGlobal(*GV)) { - unsigned offset = - AMDGPUMachineFunction::calculateKnownAddressOfLDSGlobal(*GV); - Constant *C = ConstantInt::get(CV->getContext(), APInt(32, offset)); - return AsmPrinter::lowerConstant(C); + if (const GlobalVariable *GV = dyn_cast<const GlobalVariable>(CV)) { + if (std::optional<uint32_t> Address = + AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) { + auto *IntTy = Type::getInt32Ty(CV->getContext()); + return AsmPrinter::lowerConstant(ConstantInt::get(IntTy, *Address)); } } @@ -285,11 +286,10 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { (!STI.hasOffset3fBug() || !MI->isBranch())) { SmallVector<MCFixup, 4> Fixups; SmallVector<char, 16> CodeBytes; - raw_svector_ostream CodeStream(CodeBytes); - std::unique_ptr<MCCodeEmitter> InstEmitter(createSIMCCodeEmitter( + std::unique_ptr<MCCodeEmitter> InstEmitter(createAMDGPUMCCodeEmitter( *STI.getInstrInfo(), OutContext)); - InstEmitter->encodeInstruction(TmpInst, CodeStream, Fixups, STI); + InstEmitter->encodeInstruction(TmpInst, CodeBytes, Fixups, STI); assert(CodeBytes.size() == STI.getInstrInfo()->getInstSizeInBytes(*MI)); } @@ -308,10 +308,9 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) { // Disassemble instruction/operands to hex representation. 
SmallVector<MCFixup, 4> Fixups; SmallVector<char, 16> CodeBytes; - raw_svector_ostream CodeStream(CodeBytes); DumpCodeInstEmitter->encodeInstruction( - TmpInst, CodeStream, Fixups, MF->getSubtarget<MCSubtargetInfo>()); + TmpInst, CodeBytes, Fixups, MF->getSubtarget<MCSubtargetInfo>()); HexLines.resize(HexLines.size() + 1); std::string &HexLine = HexLines.back(); raw_string_ostream HexStream(HexLine); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index c96fab08a267..d90fcac87540 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -2600,9 +2600,6 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { LLVM_DEBUG(dbgs() << "CurrentRegion: \n"); LLVM_DEBUG(LRegion->print(dbgs(), TRI)); - auto CNI = CI; - ++CNI; - MRT *Child = (*CI); if (Child->isRegion()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index a6a32b98f44c..44bbfe6f13d9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -10,8 +10,11 @@ #include "AMDGPU.h" #include "AMDGPUPerfHintAnalysis.h" #include "AMDGPUSubtarget.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Metadata.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -41,6 +44,18 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F, // Assume the attribute allocates before any known GDS globals. StaticGDSSize = GDSSize; + // Second value, if present, is the maximum value that can be assigned. + // Useful in PromoteAlloca or for LDS spills. Could be used for diagnostics + // during codegen. 
+ std::pair<unsigned, unsigned> LDSSizeRange = AMDGPU::getIntegerPairAttribute( + F, "amdgpu-lds-size", {0, UINT32_MAX}, true); + + // The two separate variables are only profitable when the LDS module lowering + // pass is disabled. If graphics does not use dynamic LDS, this is never + // profitable. Leaving cleanup for a later change. + LDSSize = LDSSizeRange.first; + StaticLDSSize = LDSSize; + CallingConv::ID CC = F.getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign); @@ -63,6 +78,42 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, unsigned Offset; if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + + std::optional<uint32_t> MaybeAbs = getLDSAbsoluteAddress(GV); + if (MaybeAbs) { + // Absolute address LDS variables that exist prior to the LDS lowering + // pass raise a fatal error in that pass. These failure modes are only + // reachable if that lowering pass is disabled or broken. If/when adding + // support for absolute addresses on user specified variables, the + // alignment check moves to the lowering pass and the frame calculation + // needs to take the user variables into consideration. + + uint32_t ObjectStart = *MaybeAbs; + + if (ObjectStart != alignTo(ObjectStart, Alignment)) { + report_fatal_error("Absolute address LDS variable inconsistent with " + "variable alignment"); + } + + if (isModuleEntryFunction()) { + // If this is a module entry function, we can also sanity check against + // the static frame. Strictly it would be better to check against the + // attribute, i.e. that the variable is within the always-allocated + // section, and not within some other non-absolute-address object + // allocated here, but the extra error detection is minimal and we would + // have to pass the Function around or cache the attribute value. 
+ uint32_t ObjectEnd = + ObjectStart + DL.getTypeAllocSize(GV.getValueType()); + if (ObjectEnd > StaticLDSSize) { + report_fatal_error( + "Absolute address LDS variable outside of static frame"); + } + } + + Entry.first->second = ObjectStart; + return ObjectStart; + } + /// TODO: We should sort these to minimize wasted space due to alignment /// padding. Currently the padding is decided by the first encountered use /// during lowering. @@ -87,135 +138,54 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, return Offset; } -static constexpr StringLiteral ModuleLDSName = "llvm.amdgcn.module.lds"; - -bool AMDGPUMachineFunction::isKnownAddressLDSGlobal(const GlobalVariable &GV) { - auto name = GV.getName(); - return (name == ModuleLDSName) || - (name.startswith("llvm.amdgcn.kernel.") && name.endswith(".lds")); -} - -const Function *AMDGPUMachineFunction::getKernelLDSFunctionFromGlobal( - const GlobalVariable &GV) { - const Module &M = *GV.getParent(); - StringRef N(GV.getName()); - if (N.consume_front("llvm.amdgcn.kernel.") && N.consume_back(".lds")) { - return M.getFunction(N); - } - return nullptr; -} - -const GlobalVariable * -AMDGPUMachineFunction::getKernelLDSGlobalFromFunction(const Function &F) { +static const GlobalVariable * +getKernelDynLDSGlobalFromFunction(const Function &F) { const Module *M = F.getParent(); - std::string KernelLDSName = "llvm.amdgcn.kernel."; - KernelLDSName += F.getName(); - KernelLDSName += ".lds"; - return M->getNamedGlobal(KernelLDSName); + std::string KernelDynLDSName = "llvm.amdgcn."; + KernelDynLDSName += F.getName(); + KernelDynLDSName += ".dynlds"; + return M->getNamedGlobal(KernelDynLDSName); } -// This kernel calls no functions that require the module lds struct -static bool canElideModuleLDS(const Function &F) { - return F.hasFnAttribute("amdgpu-elide-module-lds"); -} - -unsigned AMDGPUMachineFunction::calculateKnownAddressOfLDSGlobal( - const GlobalVariable &GV) { - // module.lds, then alignment 
padding, then kernel.lds, then other variables - // if any - - assert(isKnownAddressLDSGlobal(GV)); - unsigned Offset = 0; - - if (GV.getName() == ModuleLDSName) { - return 0; - } - - const Module *M = GV.getParent(); - const DataLayout &DL = M->getDataLayout(); - - const GlobalVariable *GVM = M->getNamedGlobal(ModuleLDSName); - const Function *f = getKernelLDSFunctionFromGlobal(GV); - - // Account for module.lds if allocated for this function - if (GVM && f && !canElideModuleLDS(*f)) { - // allocator aligns this to var align, but it's zero to begin with - Offset += DL.getTypeAllocSize(GVM->getValueType()); +std::optional<uint32_t> +AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) { + // TODO: Would be more consistent with the abs symbols to use a range + MDNode *MD = F.getMetadata("llvm.amdgcn.lds.kernel.id"); + if (MD && MD->getNumOperands() == 1) { + if (ConstantInt *KnownSize = + mdconst::extract<ConstantInt>(MD->getOperand(0))) { + uint64_t ZExt = KnownSize->getZExtValue(); + if (ZExt <= UINT32_MAX) { + return ZExt; + } + } } - - // No dynamic LDS alignment done by allocateModuleLDSGlobal - Offset = alignTo( - Offset, DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType())); - - return Offset; + return {}; } -void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(const Function &F) { - const Module *M = F.getParent(); - - // This function is called before allocating any other LDS so that it can - // reliably put values at known addresses. Consequently, dynamic LDS, if - // present, will not yet have been allocated - - assert(getDynLDSAlign() == Align() && "dynamic LDS not yet allocated"); - - if (isModuleEntryFunction()) { - - // Pointer values start from zero, memory allocated per-kernel-launch - // Variables can be grouped into a module level struct and a struct per - // kernel function by AMDGPULowerModuleLDSPass. If that is done, they - // are allocated at statically computable addresses here. 
- // - // Address 0 - // { - // llvm.amdgcn.module.lds - // } - // alignment padding - // { - // llvm.amdgcn.kernel.some-name.lds - // } - // other variables, e.g. dynamic lds, allocated after this call +std::optional<uint32_t> +AMDGPUMachineFunction::getLDSAbsoluteAddress(const GlobalValue &GV) { + if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + return {}; - const GlobalVariable *GV = M->getNamedGlobal(ModuleLDSName); - const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F); + std::optional<ConstantRange> AbsSymRange = GV.getAbsoluteSymbolRange(); + if (!AbsSymRange) + return {}; - if (GV && !canElideModuleLDS(F)) { - assert(isKnownAddressLDSGlobal(*GV)); - unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align()); - (void)Offset; - assert(Offset == calculateKnownAddressOfLDSGlobal(*GV) && - "Module LDS expected to be allocated before other LDS"); - } - - if (KV) { - // The per-kernel offset is deterministic because it is allocated - // before any other non-module LDS variables. 
- assert(isKnownAddressLDSGlobal(*KV)); - unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *KV, Align()); - (void)Offset; - assert(Offset == calculateKnownAddressOfLDSGlobal(*KV) && - "Kernel LDS expected to be immediately after module LDS"); + if (const APInt *V = AbsSymRange->getSingleElement()) { + std::optional<uint64_t> ZExt = V->tryZExtValue(); + if (ZExt && (*ZExt <= UINT32_MAX)) { + return *ZExt; } } -} -std::optional<uint32_t> -AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) { - auto MD = F.getMetadata("llvm.amdgcn.lds.kernel.id"); - if (MD && MD->getNumOperands() == 1) { - ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(0)); - if (KnownSize) { - uint64_t V = KnownSize->getZExtValue(); - if (V <= UINT32_MAX) { - return V; - } - } - } return {}; } -void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL, +void AMDGPUMachineFunction::setDynLDSAlign(const Function &F, const GlobalVariable &GV) { + const Module *M = F.getParent(); + const DataLayout &DL = M->getDataLayout(); assert(DL.getTypeAllocSize(GV.getValueType()).isZero()); Align Alignment = @@ -225,4 +195,17 @@ void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL, LDSSize = alignTo(StaticLDSSize, Alignment); DynLDSAlign = Alignment; + + // If there is a dynamic LDS variable associated with this function F, every + // further dynamic LDS instance (allocated by calling setDynLDSAlign) must + // map to the same address. This holds because no LDS is allocated after the + // lowering pass if there are dynamic LDS variables present. + const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F); + if (Dyn) { + unsigned Offset = LDSSize; // return this? 
+ std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn); + if (!Expect || (Offset != *Expect)) { + report_fatal_error("Inconsistent metadata on dynamic LDS variable"); + } + } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index f27f8252a4d8..5780fa64a7e4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -104,26 +104,12 @@ public: unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV, Align Trailing); - void allocateKnownAddressLDSGlobal(const Function &F); - - // A kernel function may have an associated LDS allocation, and a kernel-scope - // LDS allocation must have an associated kernel function - - // LDS allocation should have an associated kernel function - static const Function * - getKernelLDSFunctionFromGlobal(const GlobalVariable &GV); - static const GlobalVariable * - getKernelLDSGlobalFromFunction(const Function &F); - - // Module or kernel scope LDS variable - static bool isKnownAddressLDSGlobal(const GlobalVariable &GV); - static unsigned calculateKnownAddressOfLDSGlobal(const GlobalVariable &GV); - static std::optional<uint32_t> getLDSKernelIdMetadata(const Function &F); + static std::optional<uint32_t> getLDSAbsoluteAddress(const GlobalValue &GV); Align getDynLDSAlign() const { return DynLDSAlign; } - void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV); + void setDynLDSAlign(const Function &F, const GlobalVariable &GV); }; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index 98c5c96cd4b2..2092707c8a3f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -72,31 +72,6 @@ ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() { return new AMDGPUOpenCLEnqueuedBlockLowering(); } -/// Collect 
direct or indirect callers of \p F and save them -/// to \p Callers. -static void collectCallers(Function *F, DenseSet<Function *> &Callers) { - for (auto *U : F->users()) { - if (auto *CI = dyn_cast<CallInst>(&*U)) { - auto *Caller = CI->getParent()->getParent(); - if (Callers.insert(Caller).second) - collectCallers(Caller, Callers); - } - } -} - -/// If \p U is instruction or constant, collect functions which directly or -/// indirectly use it. -static void collectFunctionUsers(User *U, DenseSet<Function *> &Funcs) { - if (auto *I = dyn_cast<Instruction>(U)) { - auto *F = I->getParent()->getParent(); - if (Funcs.insert(F).second) - collectCallers(F, Funcs); - return; - } - for (User *U : U->users()) - collectFunctionUsers(U, Funcs); -} - bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { DenseSet<Function *> Callers; auto &C = M.getContext(); @@ -131,9 +106,6 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { /*isExternallyInitialized=*/true); LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n'); - for (User *U : F.users()) - collectFunctionUsers(U, Callers); - F.replaceAllUsesWith(ConstantExpr::getAddrSpaceCast(GV, F.getType())); F.addFnAttr("runtime-handle", RuntimeHandle); F.setLinkage(GlobalValue::ExternalLinkage); @@ -141,15 +113,5 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { } } - // FIXME: This call graph analysis is broken and should be - // removed. AMDGPUAttributor infers the individual implicit argument fields - // are needed or not, but the runtime crashes in cases where we fail to - // optimize these out at -O0. 
- for (auto *F : Callers) { - if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL) - continue; - F->addFnAttr("calls-enqueue-kernel"); - LLVM_DEBUG(dbgs() << "mark enqueue_kernel caller:" << F->getName() << '\n'); - } return Changed; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index 9c04df0b3683..536fb02cb4ec 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -1,4 +1,4 @@ -//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===// +//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp --------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -19,6 +19,8 @@ #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h" +#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineDominators.h" @@ -26,22 +28,41 @@ #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Target/TargetMachine.h" +#define GET_GICOMBINER_DEPS +#include "AMDGPUGenPreLegalizeGICombiner.inc" +#undef GET_GICOMBINER_DEPS + #define DEBUG_TYPE "amdgpu-postlegalizer-combiner" using namespace llvm; using namespace MIPatternMatch; -class AMDGPUPostLegalizerCombinerHelper { +namespace { +#define GET_GICOMBINER_TYPES +#include "AMDGPUGenPostLegalizeGICombiner.inc" +#undef GET_GICOMBINER_TYPES + +class AMDGPUPostLegalizerCombinerImpl : public GIMatchTableExecutor { protected: + const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig; + MachineIRBuilder &B; MachineFunction &MF; MachineRegisterInfo &MRI; + const GCNSubtarget 
&STI; + const SIInstrInfo &TII; AMDGPUCombinerHelper &Helper; + GISelChangeObserver &Observer; public: - AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, - AMDGPUCombinerHelper &Helper) - : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){}; + AMDGPUPostLegalizerCombinerImpl( + const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig, + MachineIRBuilder &B, AMDGPUCombinerHelper &Helper, + GISelChangeObserver &Observer); + + static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; } + + bool tryCombineAll(MachineInstr &I) const; struct FMinFMaxLegacyInfo { Register LHS; @@ -52,15 +73,16 @@ public: }; // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize - bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info); + bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info) const; void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI, - const FMinFMaxLegacyInfo &Info); + const FMinFMaxLegacyInfo &Info) const; - bool matchUCharToFloat(MachineInstr &MI); - void applyUCharToFloat(MachineInstr &MI); + bool matchUCharToFloat(MachineInstr &MI) const; + void applyUCharToFloat(MachineInstr &MI) const; - bool matchRcpSqrtToRsq(MachineInstr &MI, - std::function<void(MachineIRBuilder &)> &MatchInfo); + bool + matchRcpSqrtToRsq(MachineInstr &MI, + std::function<void(MachineIRBuilder &)> &MatchInfo) const; // FIXME: Should be able to have 2 separate matchdatas rather than custom // struct boilerplate. 
@@ -69,15 +91,49 @@ public: unsigned ShiftOffset; }; - bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo); + bool matchCvtF32UByteN(MachineInstr &MI, + CvtF32UByteMatchInfo &MatchInfo) const; void applyCvtF32UByteN(MachineInstr &MI, - const CvtF32UByteMatchInfo &MatchInfo); + const CvtF32UByteMatchInfo &MatchInfo) const; + + bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const; - bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg); + // Combine unsigned buffer load and signed extension instructions to generate + // signed buffer load instructions. + bool matchCombineSignExtendInReg(MachineInstr &MI, + MachineInstr *&MatchInfo) const; + void applyCombineSignExtendInReg(MachineInstr &MI, + MachineInstr *&MatchInfo) const; + +private: +#define GET_GICOMBINER_CLASS_MEMBERS +#define AMDGPUSubtarget GCNSubtarget +#include "AMDGPUGenPostLegalizeGICombiner.inc" +#undef GET_GICOMBINER_CLASS_MEMBERS +#undef AMDGPUSubtarget }; -bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy( - MachineInstr &MI, FMinFMaxLegacyInfo &Info) { +#define GET_GICOMBINER_IMPL +#define AMDGPUSubtarget GCNSubtarget +#include "AMDGPUGenPostLegalizeGICombiner.inc" +#undef AMDGPUSubtarget +#undef GET_GICOMBINER_IMPL + +AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl( + const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig, + MachineIRBuilder &B, AMDGPUCombinerHelper &Helper, + GISelChangeObserver &Observer) + : RuleConfig(RuleConfig), B(B), MF(B.getMF()), MRI(*B.getMRI()), + STI(MF.getSubtarget<GCNSubtarget>()), TII(*STI.getInstrInfo()), + Helper(Helper), Observer(Observer), +#define GET_GICOMBINER_CONSTRUCTOR_INITS +#include "AMDGPUGenPostLegalizeGICombiner.inc" +#undef GET_GICOMBINER_CONSTRUCTOR_INITS +{ +} + +bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy( + MachineInstr &MI, FMinFMaxLegacyInfo &Info) const { // FIXME: Type predicate on pattern if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32)) 
return false; @@ -91,6 +147,8 @@ bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy( Info.True = MI.getOperand(2).getReg(); Info.False = MI.getOperand(3).getReg(); + // TODO: Handle case where the selected value is an fneg and the compared + // constant is the negation of the selected value. if (!(Info.LHS == Info.True && Info.RHS == Info.False) && !(Info.LHS == Info.False && Info.RHS == Info.True)) return false; @@ -110,8 +168,8 @@ bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy( } } -void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy( - MachineInstr &MI, const FMinFMaxLegacyInfo &Info) { +void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinToFMaxLegacy( + MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const { B.setInstrAndDebugLoc(MI); auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) { B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags()); @@ -159,7 +217,8 @@ void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy( MI.eraseFromParent(); } -bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) { +bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat( + MachineInstr &MI) const { Register DstReg = MI.getOperand(0).getReg(); // TODO: We could try to match extracting the higher bytes, which would be @@ -178,7 +237,8 @@ bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) { return false; } -void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) { +void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat( + MachineInstr &MI) const { B.setInstrAndDebugLoc(MI); const LLT S32 = LLT::scalar(32); @@ -191,19 +251,20 @@ void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) { SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0); if (Ty == S32) { - B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, - {SrcReg}, MI.getFlags()); + B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, 
{SrcReg}, + MI.getFlags()); } else { - auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, - {SrcReg}, MI.getFlags()); + auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg}, + MI.getFlags()); B.buildFPTrunc(DstReg, Cvt0, MI.getFlags()); } MI.eraseFromParent(); } -bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq( - MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) { +bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq( + MachineInstr &MI, + std::function<void(MachineIRBuilder &)> &MatchInfo) const { auto getRcpSrc = [=](const MachineInstr &MI) { MachineInstr *ResMI = nullptr; @@ -246,8 +307,8 @@ bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq( return false; } -bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN( - MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) { +bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN( + MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const { Register SrcReg = MI.getOperand(1).getReg(); // Look through G_ZEXT. 
@@ -274,8 +335,8 @@ bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN( return false; } -void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN( - MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) { +void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN( + MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const { B.setInstrAndDebugLoc(MI); unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8; @@ -292,57 +353,66 @@ void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN( MI.eraseFromParent(); } -bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize( - MachineInstr &MI, Register &Reg) { +bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize( + MachineInstr &MI, Register &Reg) const { const SITargetLowering *TLI = static_cast<const SITargetLowering *>( MF.getSubtarget().getTargetLowering()); Reg = MI.getOperand(1).getReg(); return TLI->isCanonicalized(Reg, MF); } -class AMDGPUPostLegalizerCombinerHelperState { -protected: - AMDGPUCombinerHelper &Helper; - AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper; +// The buffer_load_{i8, i16} intrinsics are initially lowered as buffer_load_{u8, +// u16} instructions. Here, the buffer_load_{u8, u16} instructions are combined +// with sign extension instructions in order to generate buffer_load_{i8, i16} +// instructions. - // Note: pointer is necessary because Target Predicates use - // "Subtarget->" - const GCNSubtarget *Subtarget; +// Identify buffer_load_{u8, u16}. 
+bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg( + MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const { + Register Op0Reg = MI.getOperand(1).getReg(); + SubwordBufferLoad = MRI.getVRegDef(Op0Reg); -public: - AMDGPUPostLegalizerCombinerHelperState( - AMDGPUCombinerHelper &Helper, - AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper, - const GCNSubtarget &Subtarget) - : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper), - Subtarget(&Subtarget) {} -}; + if (!MRI.hasOneNonDBGUse(Op0Reg)) + return false; -#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS -#include "AMDGPUGenPostLegalizeGICombiner.inc" -#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + // Check if the first operand of the sign extension is a subword buffer load + // instruction. + return SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE || + SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; +} -namespace { -#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H -#include "AMDGPUGenPostLegalizeGICombiner.inc" -#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +// Combine buffer_load_{u8, u16} and the sign extension instruction to generate +// buffer_load_{i8, i16}. +void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg( + MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const { + // Modify the opcode and the destination of buffer_load_{u8, u16}: + // Replace the opcode. + unsigned Opc = + SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE + ? AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE + : AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT; + SubwordBufferLoad->setDesc(TII.get(Opc)); + // Update the destination register of SubwordBufferLoad with the destination + // register of the sign extension. + Register SignExtendInsnDst = MI.getOperand(0).getReg(); + SubwordBufferLoad->getOperand(0).setReg(SignExtendInsnDst); + // Remove the sign extension. 
+ MI.eraseFromParent(); +} class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo { GISelKnownBits *KB; MachineDominatorTree *MDT; - const GCNSubtarget &Subtarget; + AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig; public: - AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg; - - AMDGPUPostLegalizerCombinerInfo(const GCNSubtarget &Subtarget, bool EnableOpt, - bool OptSize, bool MinSize, + AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, const AMDGPULegalizerInfo *LI, GISelKnownBits *KB, MachineDominatorTree *MDT) : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize), - KB(KB), MDT(MDT), Subtarget(Subtarget) { - if (!GeneratedRuleCfg.parseCommandLineOption()) + KB(KB), MDT(MDT) { + if (!RuleConfig.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } @@ -355,11 +425,11 @@ bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, MachineIRBuilder &B) const { AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ false, KB, MDT, LInfo); - AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper); - AMDGPUGenPostLegalizerCombinerHelper Generated( - GeneratedRuleCfg, Helper, PostLegalizerHelper, Subtarget); + // TODO: Do not re-create the Impl on every inst, it should be per function. 
+ AMDGPUPostLegalizerCombinerImpl Impl(RuleConfig, B, Helper, Observer); + Impl.setupMF(*MI.getMF(), KB); - if (Generated.tryCombineAll(Observer, MI, B)) + if (Impl.tryCombineAll(MI)) return true; switch (MI.getOpcode()) { @@ -375,10 +445,6 @@ bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, return false; } -#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP -#include "AMDGPUGenPostLegalizeGICombiner.inc" -#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP - // Pass boilerplate // ================ @@ -414,7 +480,7 @@ void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { } AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone) - : MachineFunctionPass(ID), IsOptNone(IsOptNone) { + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry()); } @@ -428,13 +494,13 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const AMDGPULegalizerInfo *LI - = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo()); + const AMDGPULegalizerInfo *LI = + static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo()); GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); MachineDominatorTree *MDT = IsOptNone ? 
nullptr : &getAnalysis<MachineDominatorTree>(); - AMDGPUPostLegalizerCombinerInfo PCInfo(ST, EnableOpt, F.hasOptSize(), + AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), F.hasMinSize(), LI, KB, MDT); Combiner C(PCInfo, TPC); return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); @@ -442,8 +508,8 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { char AMDGPUPostLegalizerCombiner::ID = 0; INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE, - "Combine AMDGPU machine instrs after legalization", - false, false) + "Combine AMDGPU machine instrs after legalization", false, + false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index a02d2cd302fb..936ca54fcf2e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -20,28 +20,48 @@ #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h" +#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/Target/TargetMachine.h" +#define GET_GICOMBINER_DEPS +#include "AMDGPUGenPreLegalizeGICombiner.inc" +#undef GET_GICOMBINER_DEPS + #define DEBUG_TYPE "amdgpu-prelegalizer-combiner" using namespace llvm; using namespace MIPatternMatch; +namespace { + +#define GET_GICOMBINER_TYPES +#include "AMDGPUGenPreLegalizeGICombiner.inc" +#undef GET_GICOMBINER_TYPES -class AMDGPUPreLegalizerCombinerHelper { +class 
AMDGPUPreLegalizerCombinerImpl : public GIMatchTableExecutor { protected: + const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig; + const GCNSubtarget &STI; + + GISelChangeObserver &Observer; MachineIRBuilder &B; MachineFunction &MF; MachineRegisterInfo &MRI; AMDGPUCombinerHelper &Helper; public: - AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, - AMDGPUCombinerHelper &Helper) - : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){}; + AMDGPUPreLegalizerCombinerImpl( + const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig, + const GCNSubtarget &STI, GISelChangeObserver &Observer, + MachineIRBuilder &B, AMDGPUCombinerHelper &Helper); + + static const char *getName() { return "AMDGPUPreLegalizerCombinerImpl"; } + + bool tryCombineAll(MachineInstr &I) const; struct ClampI64ToI16MatchInfo { int64_t Cmp1 = 0; @@ -49,17 +69,42 @@ public: Register Origin; }; - bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineFunction &MF, - ClampI64ToI16MatchInfo &MatchInfo); + bool matchClampI64ToI16(MachineInstr &MI, const MachineRegisterInfo &MRI, + const MachineFunction &MF, + ClampI64ToI16MatchInfo &MatchInfo) const; void applyClampI64ToI16(MachineInstr &MI, - const ClampI64ToI16MatchInfo &MatchInfo); + const ClampI64ToI16MatchInfo &MatchInfo) const; + +private: +#define GET_GICOMBINER_CLASS_MEMBERS +#define AMDGPUSubtarget GCNSubtarget +#include "AMDGPUGenPreLegalizeGICombiner.inc" +#undef GET_GICOMBINER_CLASS_MEMBERS +#undef AMDGPUSubtarget }; -bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16( - MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF, - ClampI64ToI16MatchInfo &MatchInfo) { +#define GET_GICOMBINER_IMPL +#define AMDGPUSubtarget GCNSubtarget +#include "AMDGPUGenPreLegalizeGICombiner.inc" +#undef AMDGPUSubtarget +#undef GET_GICOMBINER_IMPL + +AMDGPUPreLegalizerCombinerImpl::AMDGPUPreLegalizerCombinerImpl( + const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig, + const GCNSubtarget &STI, 
GISelChangeObserver &Observer, MachineIRBuilder &B, + AMDGPUCombinerHelper &Helper) + : RuleConfig(RuleConfig), STI(STI), Observer(Observer), B(B), MF(B.getMF()), + MRI(*B.getMRI()), Helper(Helper), +#define GET_GICOMBINER_CONSTRUCTOR_INITS +#include "AMDGPUGenPreLegalizeGICombiner.inc" +#undef GET_GICOMBINER_CONSTRUCTOR_INITS +{ +} + +bool AMDGPUPreLegalizerCombinerImpl::matchClampI64ToI16( + MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineFunction &MF, + ClampI64ToI16MatchInfo &MatchInfo) const { assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!"); // Try to find a pattern where an i64 value should get clamped to short. @@ -118,8 +163,8 @@ bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16( // This can be efficiently written as following: // v_cvt_pk_i16_i32 v0, v0, v1 // v_med3_i32 v0, Clamp_Min, v0, Clamp_Max -void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16( - MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) { +void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16( + MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const { Register Src = MatchInfo.Origin; assert(MI.getParent()->getParent()->getRegInfo().getType(Src) == @@ -154,40 +199,18 @@ void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16( MI.eraseFromParent(); } -class AMDGPUPreLegalizerCombinerHelperState { -protected: - AMDGPUCombinerHelper &Helper; - AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper; - -public: - AMDGPUPreLegalizerCombinerHelperState( - AMDGPUCombinerHelper &Helper, - AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper) - : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {} -}; - -#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS -#include "AMDGPUGenPreLegalizeGICombiner.inc" -#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS - -namespace { -#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H -#include "AMDGPUGenPreLegalizeGICombiner.inc" -#undef 
AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H - class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo { GISelKnownBits *KB; MachineDominatorTree *MDT; + AMDGPUPreLegalizerCombinerImplRuleConfig RuleConfig; public: - AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg; - AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, - GISelKnownBits *KB, MachineDominatorTree *MDT) + GISelKnownBits *KB, MachineDominatorTree *MDT) : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize), KB(KB), MDT(MDT) { - if (!GeneratedRuleCfg.parseCommandLineOption()) + if (!RuleConfig.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } @@ -196,15 +219,17 @@ public: }; bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, - MachineInstr &MI, - MachineIRBuilder &B) const { + MachineInstr &MI, + MachineIRBuilder &B) const { const auto *LI = MI.getMF()->getSubtarget().getLegalizerInfo(); AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ true, KB, MDT, LI); - AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper); - AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper, - PreLegalizerHelper); - if (Generated.tryCombineAll(Observer, MI, B)) + const GCNSubtarget &STI = MI.getMF()->getSubtarget<GCNSubtarget>(); + // TODO: Do not re-create the Impl on every inst, it should be per function. 
+ AMDGPUPreLegalizerCombinerImpl Impl(RuleConfig, STI, Observer, B, Helper); + Impl.setupMF(*MI.getMF(), KB); + + if (Impl.tryCombineAll(MI)) return true; switch (MI.getOpcode()) { @@ -217,10 +242,6 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, return false; } -#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP -#include "AMDGPUGenPreLegalizeGICombiner.inc" -#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP - // Pass boilerplate // ================ @@ -237,6 +258,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; + private: bool IsOptNone; }; @@ -259,7 +281,7 @@ void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { } AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone) - : MachineFunctionPass(ID), IsOptNone(IsOptNone) { + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index b24300923780..13f83e298cf4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -19,9 +19,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "llvm/ADT/Triple.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" @@ -29,6 +27,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/InitializePasses.h" #include "llvm/Support/DataExtractor.h" +#include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; @@ -46,19 +45,11 @@ public: private: bool 
runOnModule(Module &M) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<DominatorTreeWrapperPass>(); - } }; class AMDGPUPrintfRuntimeBindingImpl { public: - AMDGPUPrintfRuntimeBindingImpl( - function_ref<const DominatorTree &(Function &)> GetDT, - function_ref<const TargetLibraryInfo &(Function &)> GetTLI) - : GetDT(GetDT), GetTLI(GetTLI) {} + AMDGPUPrintfRuntimeBindingImpl() {} bool run(Module &M); private: @@ -67,14 +58,7 @@ private: bool lowerPrintfForGpu(Module &M); - Value *simplify(Instruction *I, const TargetLibraryInfo *TLI, - const DominatorTree *DT) { - return simplifyInstruction(I, {*TD, TLI, DT}); - } - const DataLayout *TD; - function_ref<const DominatorTree &(Function &)> GetDT; - function_ref<const TargetLibraryInfo &(Function &)> GetTLI; SmallVector<CallInst *, 32> Printfs; }; } // namespace @@ -175,23 +159,6 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { SmallString<16> OpConvSpecifiers; Value *Op = CI->getArgOperand(0); - if (auto LI = dyn_cast<LoadInst>(Op)) { - Op = LI->getPointerOperand(); - for (auto *Use : Op->users()) { - if (auto SI = dyn_cast<StoreInst>(Use)) { - Op = SI->getValueOperand(); - break; - } - } - } - - if (auto I = dyn_cast<Instruction>(Op)) { - Value *Op_simplified = - simplify(I, &GetTLI(*I->getFunction()), &GetDT(*I->getFunction())); - if (Op_simplified) - Op = Op_simplified; - } - StringRef FormatStr; if (!getConstantStringInfo(Op, FormatStr)) { Value *Stripped = Op->stripPointerCasts(); @@ -438,20 +405,15 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) { for (unsigned I = 0, E = WhatToStore.size(); I != E; ++I) { Value *TheBtCast = WhatToStore[I]; unsigned ArgSize = TD->getTypeAllocSize(TheBtCast->getType()); - SmallVector<Value *, 1> BuffOffset; - BuffOffset.push_back(ConstantInt::get(I32Ty, ArgSize)); - - Type *ArgPointer = PointerType::get(TheBtCast->getType(), 1); - Value *CastedGEP = 
- new BitCastInst(BufferIdx, ArgPointer, "PrintBuffPtrCast", Brnch); - StoreInst *StBuff = new StoreInst(TheBtCast, CastedGEP, Brnch); + StoreInst *StBuff = new StoreInst(TheBtCast, BufferIdx, Brnch); LLVM_DEBUG(dbgs() << "inserting store to printf buffer:\n" << *StBuff << '\n'); (void)StBuff; if (I + 1 == E && ArgCount + 1 == CI->arg_size()) break; - BufferIdx = GetElementPtrInst::Create(I8Ty, BufferIdx, BuffOffset, - "PrintBuffNextPtr", Brnch); + BufferIdx = GetElementPtrInst::Create( + I8Ty, BufferIdx, {ConstantInt::get(I32Ty, ArgSize)}, + "PrintBuffNextPtr", Brnch); LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n" << *BufferIdx << '\n'); } @@ -491,26 +453,11 @@ bool AMDGPUPrintfRuntimeBindingImpl::run(Module &M) { } bool AMDGPUPrintfRuntimeBinding::runOnModule(Module &M) { - auto GetDT = [this](Function &F) -> DominatorTree & { - return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); - }; - auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { - return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); - }; - - return AMDGPUPrintfRuntimeBindingImpl(GetDT, GetTLI).run(M); + return AMDGPUPrintfRuntimeBindingImpl().run(M); } PreservedAnalyses AMDGPUPrintfRuntimeBindingPass::run(Module &M, ModuleAnalysisManager &AM) { - FunctionAnalysisManager &FAM = - AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); - auto GetDT = [&FAM](Function &F) -> DominatorTree & { - return FAM.getResult<DominatorTreeAnalysis>(F); - }; - auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { - return FAM.getResult<TargetLibraryAnalysis>(F); - }; - bool Changed = AMDGPUPrintfRuntimeBindingImpl(GetDT, GetTLI).run(M); + bool Changed = AMDGPUPrintfRuntimeBindingImpl().run(M); return Changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index a7da4005e867..1d69f0434b58 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -6,23 +6,42 @@ // //===----------------------------------------------------------------------===// // -// This pass eliminates allocas by either converting them into vectors or -// by migrating them to local address space. +// Eliminates allocas by either converting them into vectors or by migrating +// them to local address space. +// +// Two passes are exposed by this file: +// - "promote-alloca-to-vector", which runs early in the pipeline and only +// promotes to vector. Promotion to vector is almost always profitable +// except when the alloca is too big and the promotion would result in +// very high register pressure. +// - "promote-alloca", which does both promotion to vector and LDS and runs +// much later in the pipeline. This runs after SROA because promoting to +// LDS is of course less profitable than getting rid of the alloca or +// vectorizing it, thus we only want to do it when the only alternative is +// lowering the alloca to stack. +// +// Note that both of them exist for the old and new PMs. 
The new PM passes are +// declared in AMDGPU.h and the legacy PM ones are declared here.s // //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/InstSimplifyFolder.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" #define DEBUG_TYPE "amdgpu-promote-alloca" @@ -30,40 +49,22 @@ using namespace llvm; namespace { -static cl::opt<bool> DisablePromoteAllocaToVector( - "disable-promote-alloca-to-vector", - cl::desc("Disable promote alloca to vector"), - cl::init(false)); +static cl::opt<bool> + DisablePromoteAllocaToVector("disable-promote-alloca-to-vector", + cl::desc("Disable promote alloca to vector"), + cl::init(false)); -static cl::opt<bool> DisablePromoteAllocaToLDS( - "disable-promote-alloca-to-lds", - cl::desc("Disable promote alloca to LDS"), - cl::init(false)); +static cl::opt<bool> + DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds", + cl::desc("Disable promote alloca to LDS"), + cl::init(false)); static cl::opt<unsigned> PromoteAllocaToVectorLimit( - "amdgpu-promote-alloca-to-vector-limit", - cl::desc("Maximum byte size to consider promote alloca to vector"), - cl::init(0)); - -// FIXME: This can create globals so should be a module pass. 
-class AMDGPUPromoteAlloca : public FunctionPass { -public: - static char ID; - - AMDGPUPromoteAlloca() : FunctionPass(ID) {} - - bool runOnFunction(Function &F) override; - - StringRef getPassName() const override { return "AMDGPU Promote Alloca"; } - - bool handleAlloca(AllocaInst &I, bool SufficientLDS); - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - FunctionPass::getAnalysisUsage(AU); - } -}; + "amdgpu-promote-alloca-to-vector-limit", + cl::desc("Maximum byte size to consider promote alloca to vector"), + cl::init(0)); +// Shared implementation which can do both promotion to vector and to LDS. class AMDGPUPromoteAllocaImpl { private: const TargetMachine &TM; @@ -83,26 +84,55 @@ private: /// BaseAlloca is the alloca root the search started from. /// Val may be that alloca or a recursive user of it. - bool collectUsesWithPtrTypes(Value *BaseAlloca, - Value *Val, - std::vector<Value*> &WorkList) const; + bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val, + std::vector<Value *> &WorkList) const; /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand /// indices to an instruction with 2 pointer inputs (e.g. select, icmp). /// Returns true if both operands are derived from the same alloca. Val should /// be the same value as one of the input operands of UseInst. bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val, - Instruction *UseInst, - int OpIdx0, int OpIdx1) const; + Instruction *UseInst, int OpIdx0, + int OpIdx1) const; /// Check whether we have enough local memory for promotion. 
bool hasSufficientLocalMem(const Function &F); - bool handleAlloca(AllocaInst &I, bool SufficientLDS); + bool tryPromoteAllocaToVector(AllocaInst &I); + bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS); public: - AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {} - bool run(Function &F); + AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) { + const Triple &TT = TM.getTargetTriple(); + IsAMDGCN = TT.getArch() == Triple::amdgcn; + IsAMDHSA = TT.getOS() == Triple::AMDHSA; + } + + bool run(Function &F, bool PromoteToLDS); +}; + +// FIXME: This can create globals so should be a module pass. +class AMDGPUPromoteAlloca : public FunctionPass { +public: + static char ID; + + AMDGPUPromoteAlloca() : FunctionPass(ID) {} + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) + return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>()) + .run(F, /*PromoteToLDS*/ true); + return false; + } + + StringRef getPassName() const override { return "AMDGPU Promote Alloca"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + } }; class AMDGPUPromoteAllocaToVector : public FunctionPass { @@ -111,7 +141,14 @@ public: AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {} - bool runOnFunction(Function &F) override; + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) + return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>()) + .run(F, /*PromoteToLDS*/ false); + return false; + } StringRef getPassName() const override { return "AMDGPU Promote Alloca to vector"; @@ -123,6 +160,22 @@ public: } }; +unsigned getMaxVGPRs(const TargetMachine &TM, const Function &F) { + if (!TM.getTargetTriple().isAMDGCN()) + return 128; + + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); + unsigned MaxVGPRs = 
ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); + + // A non-entry function has only 32 caller preserved registers. + // Do not promote alloca which will force spilling unless we know the function + // will be inlined. + if (!F.hasFnAttribute(Attribute::AlwaysInline) && + !AMDGPU::isEntryFunctionCC(F.getCallingConv())) + MaxVGPRs = std::min(MaxVGPRs, 32u); + return MaxVGPRs; +} + } // end anonymous namespace char AMDGPUPromoteAlloca::ID = 0; @@ -142,19 +195,20 @@ INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector", char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID; char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID; -bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { - return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>()).run(F); +PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F, + FunctionAnalysisManager &AM) { + bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ true); + if (Changed) { + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; } - return false; + return PreservedAnalyses::all(); } -PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F, - FunctionAnalysisManager &AM) { - bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F); +PreservedAnalyses +AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) { + bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ false); if (Changed) { PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); @@ -163,175 +217,72 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F, return PreservedAnalyses::all(); } -bool AMDGPUPromoteAllocaImpl::run(Function &F) { +FunctionPass *llvm::createAMDGPUPromoteAlloca() { + return new AMDGPUPromoteAlloca(); +} + +FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() { + return new AMDGPUPromoteAllocaToVector(); +} + +bool 
AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { Mod = F.getParent(); DL = &Mod->getDataLayout(); - const Triple &TT = TM.getTargetTriple(); - IsAMDGCN = TT.getArch() == Triple::amdgcn; - IsAMDHSA = TT.getOS() == Triple::AMDHSA; - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); if (!ST.isPromoteAllocaEnabled()) return false; - if (IsAMDGCN) { - const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); - MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); - // A non-entry function has only 32 caller preserved registers. - // Do not promote alloca which will force spilling. - if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) - MaxVGPRs = std::min(MaxVGPRs, 32u); - } else { - MaxVGPRs = 128; - } + MaxVGPRs = getMaxVGPRs(TM, F); - bool SufficientLDS = hasSufficientLocalMem(F); - bool Changed = false; - BasicBlock &EntryBB = *F.begin(); + bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false; SmallVector<AllocaInst *, 16> Allocas; - for (Instruction &I : EntryBB) { - if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) + for (Instruction &I : F.getEntryBlock()) { + if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) { + // Array allocations are probably not worth handling, since an allocation + // of the array type is the canonical form. + if (!AI->isStaticAlloca() || AI->isArrayAllocation()) + continue; Allocas.push_back(AI); + } } + bool Changed = false; for (AllocaInst *AI : Allocas) { - if (handleAlloca(*AI, SufficientLDS)) + if (tryPromoteAllocaToVector(*AI)) + Changed = true; + else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS)) Changed = true; } + // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains + // dangling pointers. If we want to reuse it past this point, the loop above + // would need to be updated to remove successfully promoted allocas. 
+ return Changed; } -std::pair<Value *, Value *> -AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { - Function &F = *Builder.GetInsertBlock()->getParent(); - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); - - if (!IsAMDHSA) { - Function *LocalSizeYFn - = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y); - Function *LocalSizeZFn - = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z); - - CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {}); - CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {}); - - ST.makeLIDRangeMetadata(LocalSizeY); - ST.makeLIDRangeMetadata(LocalSizeZ); - - return std::pair(LocalSizeY, LocalSizeZ); - } - - // We must read the size out of the dispatch pointer. - assert(IsAMDGCN); +struct MemTransferInfo { + ConstantInt *SrcIndex = nullptr; + ConstantInt *DestIndex = nullptr; +}; - // We are indexing into this struct, and want to extract the workgroup_size_* - // fields. - // - // typedef struct hsa_kernel_dispatch_packet_s { - // uint16_t header; - // uint16_t setup; - // uint16_t workgroup_size_x ; - // uint16_t workgroup_size_y; - // uint16_t workgroup_size_z; - // uint16_t reserved0; - // uint32_t grid_size_x ; - // uint32_t grid_size_y ; - // uint32_t grid_size_z; - // - // uint32_t private_segment_size; - // uint32_t group_segment_size; - // uint64_t kernel_object; +// Checks if the instruction I is a memset user of the alloca AI that we can +// deal with. Currently, only non-volatile memsets that affect the whole alloca +// are handled. +static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI, + const DataLayout &DL) { + using namespace PatternMatch; + // For now we only care about non-volatile memsets that affect the whole type + // (start at index 0 and fill the whole alloca). 
// - // #ifdef HSA_LARGE_MODEL - // void *kernarg_address; - // #elif defined HSA_LITTLE_ENDIAN - // void *kernarg_address; - // uint32_t reserved1; - // #else - // uint32_t reserved1; - // void *kernarg_address; - // #endif - // uint64_t reserved2; - // hsa_signal_t completion_signal; // uint64_t wrapper - // } hsa_kernel_dispatch_packet_t - // - Function *DispatchPtrFn - = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr); - - CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {}); - DispatchPtr->addRetAttr(Attribute::NoAlias); - DispatchPtr->addRetAttr(Attribute::NonNull); - F.removeFnAttr("amdgpu-no-dispatch-ptr"); - - // Size of the dispatch packet struct. - DispatchPtr->addDereferenceableRetAttr(64); - - Type *I32Ty = Type::getInt32Ty(Mod->getContext()); - Value *CastDispatchPtr = Builder.CreateBitCast( - DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS)); - - // We could do a single 64-bit load here, but it's likely that the basic - // 32-bit and extract sequence is already present, and it is probably easier - // to CSE this. The loads should be mergeable later anyway. - Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1); - LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4)); - - Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2); - LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4)); - - MDNode *MD = MDNode::get(Mod->getContext(), std::nullopt); - LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD); - LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD); - ST.makeLIDRangeMetadata(LoadZU); - - // Extract y component. Upper half of LoadZU should be zero already. 
- Value *Y = Builder.CreateLShr(LoadXY, 16); - - return std::pair(Y, LoadZU); -} - -Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder, - unsigned N) { - Function *F = Builder.GetInsertBlock()->getParent(); - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F); - Intrinsic::ID IntrID = Intrinsic::not_intrinsic; - StringRef AttrName; - - switch (N) { - case 0: - IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x - : (Intrinsic::ID)Intrinsic::r600_read_tidig_x; - AttrName = "amdgpu-no-workitem-id-x"; - break; - case 1: - IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y - : (Intrinsic::ID)Intrinsic::r600_read_tidig_y; - AttrName = "amdgpu-no-workitem-id-y"; - break; - - case 2: - IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z - : (Intrinsic::ID)Intrinsic::r600_read_tidig_z; - AttrName = "amdgpu-no-workitem-id-z"; - break; - default: - llvm_unreachable("invalid dimension"); - } - - Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID); - CallInst *CI = Builder.CreateCall(WorkitemIdFn); - ST.makeLIDRangeMetadata(CI); - F->removeFnAttr(AttrName); - - return CI; -} - -static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) { - return FixedVectorType::get(ArrayTy->getElementType(), - ArrayTy->getNumElements()); + // TODO: Now that we moved to PromoteAlloca we could handle any memsets + // (except maybe volatile ones?) - we just need to use shufflevector if it + // only affects a subset of the vector. 
+ const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType()); + return I->getOperand(0) == AI && + match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile(); } static Value * @@ -379,60 +330,336 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, return ConstantInt::get(GEP->getContext(), Quot); } -struct MemTransferInfo { - ConstantInt *SrcIndex = nullptr; - ConstantInt *DestIndex = nullptr; -}; +/// Promotes a single user of the alloca to a vector form. +/// +/// \param Inst Instruction to be promoted. +/// \param DL Module Data Layout. +/// \param VectorTy Vectorized Type. +/// \param VecStoreSize Size of \p VectorTy in bytes. +/// \param ElementSize Size of \p VectorTy element type in bytes. +/// \param TransferInfo MemTransferInst info map. +/// \param GEPVectorIdx GEP -> VectorIdx cache. +/// \param CurVal Current value of the vector (e.g. last stored value) +/// \param[out] DeferredLoads \p Inst is added to this vector if it can't +/// be promoted now. This happens when promoting requires \p +/// CurVal, but \p CurVal is nullptr. +/// \return the stored value if \p Inst would have written to the alloca, or +/// nullptr otherwise. +static Value *promoteAllocaUserToVector( + Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy, + unsigned VecStoreSize, unsigned ElementSize, + DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo, + std::map<GetElementPtrInst *, Value *> &GEPVectorIdx, Value *CurVal, + SmallVectorImpl<LoadInst *> &DeferredLoads) { + // Note: we use InstSimplifyFolder because it can leverage the DataLayout + // to do more folding, especially in the case of vector splats. + IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(), + InstSimplifyFolder(DL)); + Builder.SetInsertPoint(Inst); + + const auto GetOrLoadCurrentVectorValue = [&]() -> Value * { + if (CurVal) + return CurVal; + + // If the current value is not known, insert a dummy load and lower it on + // the second pass. 
+ LoadInst *Dummy = + Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()), + "promotealloca.dummyload"); + DeferredLoads.push_back(Dummy); + return Dummy; + }; + + const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val, + Type *PtrTy) -> Value * { + assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy)); + const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy); + if (!PtrTy->isVectorTy()) + return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size)); + const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements(); + // If we want to cast to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to + // first cast the ptr vector to <2 x i64>. + assert((Size % NumPtrElts == 0) && "Vector size not divisble"); + Type *EltTy = Builder.getIntNTy(Size / NumPtrElts); + return Builder.CreateBitOrPointerCast( + Val, FixedVectorType::get(EltTy, NumPtrElts)); + }; + + Type *VecEltTy = VectorTy->getElementType(); + switch (Inst->getOpcode()) { + case Instruction::Load: { + // Loads can only be lowered if the value is known. + if (!CurVal) { + DeferredLoads.push_back(cast<LoadInst>(Inst)); + return nullptr; + } + + Value *Index = calculateVectorIndex( + cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx); + + // We're loading the full vector. + Type *AccessTy = Inst->getType(); + TypeSize AccessSize = DL.getTypeStoreSize(AccessTy); + if (AccessSize == VecStoreSize && cast<Constant>(Index)->isZeroValue()) { + if (AccessTy->isPtrOrPtrVectorTy()) + CurVal = CreateTempPtrIntCast(CurVal, AccessTy); + else if (CurVal->getType()->isPtrOrPtrVectorTy()) + CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType()); + Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy); + Inst->replaceAllUsesWith(NewVal); + return nullptr; + } -static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, - unsigned MaxVGPRs) { + // Loading a subvector. 
+ if (isa<FixedVectorType>(AccessTy)) { + assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); + const unsigned NumElts = AccessSize / DL.getTypeStoreSize(VecEltTy); + auto *SubVecTy = FixedVectorType::get(VecEltTy, NumElts); + assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); + + unsigned IndexVal = cast<ConstantInt>(Index)->getZExtValue(); + Value *SubVec = PoisonValue::get(SubVecTy); + for (unsigned K = 0; K < NumElts; ++K) { + SubVec = Builder.CreateInsertElement( + SubVec, Builder.CreateExtractElement(CurVal, IndexVal + K), K); + } + + if (AccessTy->isPtrOrPtrVectorTy()) + SubVec = CreateTempPtrIntCast(SubVec, AccessTy); + else if (SubVecTy->isPtrOrPtrVectorTy()) + SubVec = CreateTempPtrIntCast(SubVec, SubVecTy); + + SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy); + Inst->replaceAllUsesWith(SubVec); + return nullptr; + } + + // We're loading one element. + Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index); + if (AccessTy != VecEltTy) + ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy); + + Inst->replaceAllUsesWith(ExtractElement); + return nullptr; + } + case Instruction::Store: { + // For stores, it's a bit trickier and it depends on whether we're storing + // the full vector or not. If we're storing the full vector, we don't need + // to know the current value. If this is a store of a single element, we + // need to know the value. + StoreInst *SI = cast<StoreInst>(Inst); + Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx); + Value *Val = SI->getValueOperand(); + + // We're storing the full vector, we can handle this without knowing CurVal. 
+ Type *AccessTy = Val->getType(); + TypeSize AccessSize = DL.getTypeStoreSize(AccessTy); + if (AccessSize == VecStoreSize && cast<Constant>(Index)->isZeroValue()) { + if (AccessTy->isPtrOrPtrVectorTy()) + Val = CreateTempPtrIntCast(Val, AccessTy); + else if (VectorTy->isPtrOrPtrVectorTy()) + Val = CreateTempPtrIntCast(Val, VectorTy); + return Builder.CreateBitOrPointerCast(Val, VectorTy); + } + + // Storing a subvector. + if (isa<FixedVectorType>(AccessTy)) { + assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); + const unsigned NumElts = AccessSize / DL.getTypeStoreSize(VecEltTy); + auto *SubVecTy = FixedVectorType::get(VecEltTy, NumElts); + assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); + + if (SubVecTy->isPtrOrPtrVectorTy()) + Val = CreateTempPtrIntCast(Val, SubVecTy); + else if (AccessTy->isPtrOrPtrVectorTy()) + Val = CreateTempPtrIntCast(Val, AccessTy); + + Val = Builder.CreateBitOrPointerCast(Val, SubVecTy); + + unsigned IndexVal = cast<ConstantInt>(Index)->getZExtValue(); + Value *CurVec = GetOrLoadCurrentVectorValue(); + for (unsigned K = 0; (IndexVal + K) < NumElts; ++K) { + CurVec = Builder.CreateInsertElement( + CurVec, Builder.CreateExtractElement(Val, K), IndexVal + K); + } + return CurVec; + } + + if (Val->getType() != VecEltTy) + Val = Builder.CreateBitOrPointerCast(Val, VecEltTy); + return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val, + Index); + } + case Instruction::Call: { + if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) { + // For memcpy, we need to know curval. 
+ ConstantInt *Length = cast<ConstantInt>(MTI->getLength()); + unsigned NumCopied = Length->getZExtValue() / ElementSize; + MemTransferInfo *TI = &TransferInfo[MTI]; + unsigned SrcBegin = TI->SrcIndex->getZExtValue(); + unsigned DestBegin = TI->DestIndex->getZExtValue(); + + SmallVector<int> Mask; + for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) { + if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { + Mask.push_back(SrcBegin++); + } else { + Mask.push_back(Idx); + } + } + + return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask); + } + + if (auto *MSI = dyn_cast<MemSetInst>(Inst)) { + // For memset, we don't need to know the previous value because we + // currently only allow memsets that cover the whole alloca. + Value *Elt = MSI->getOperand(1); + if (DL.getTypeStoreSize(VecEltTy) > 1) { + Value *EltBytes = + Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt); + Elt = Builder.CreateBitCast(EltBytes, VecEltTy); + } + + return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt); + } + + llvm_unreachable("Unsupported call when promoting alloca to vector"); + } + + default: + llvm_unreachable("Inconsistency in instructions promotable to vector"); + } + + llvm_unreachable("Did not return after promoting instruction!"); +} + +static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy, + const DataLayout &DL) { + // Access as a vector type can work if the size of the access vector is a + // multiple of the size of the alloca's vector element type. + // + // Examples: + // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK + // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK + // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK + // - 3*32 is not a multiple of 64 + // + // We could handle more complicated cases, but it'd make things a lot more + // complicated. 
+ if (isa<FixedVectorType>(AccessTy)) { + TypeSize AccTS = DL.getTypeStoreSize(AccessTy); + TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType()); + return AccTS.isKnownMultipleOf(VecTS); + } + + return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy, + DL); +} + +/// Iterates over an instruction worklist that may contain multiple instructions +/// from the same basic block, but in a different order. +template <typename InstContainer> +static void forEachWorkListItem(const InstContainer &WorkList, + std::function<void(Instruction *)> Fn) { + // Bucket up uses of the alloca by the block they occur in. + // This is important because we have to handle multiple defs/uses in a block + // ourselves: SSAUpdater is purely for cross-block references. + DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock; + for (Instruction *User : WorkList) + UsesByBlock[User->getParent()].insert(User); + + for (Instruction *User : WorkList) { + BasicBlock *BB = User->getParent(); + auto &BlockUses = UsesByBlock[BB]; + + // Already processed, skip. + if (BlockUses.empty()) + continue; + + // Only user in the block, directly process it. + if (BlockUses.size() == 1) { + Fn(User); + continue; + } + + // Multiple users in the block, do a linear scan to see users in order. + for (Instruction &Inst : *BB) { + if (!BlockUses.contains(&Inst)) + continue; + + Fn(&Inst); + } + + // Clear the block so we know it's been processed. + BlockUses.clear(); + } +} + +// FIXME: Should try to pick the most likely to be profitable allocas first. 
+bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { + LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n'); if (DisablePromoteAllocaToVector) { - LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n"); + LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n"); return false; } - Type *AllocaTy = Alloca->getAllocatedType(); + Type *AllocaTy = Alloca.getAllocatedType(); auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy); if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) { if (VectorType::isValidElementType(ArrayTy->getElementType()) && ArrayTy->getNumElements() > 0) - VectorTy = arrayTypeToVecType(ArrayTy); + VectorTy = FixedVectorType::get(ArrayTy->getElementType(), + ArrayTy->getNumElements()); } // Use up to 1/4 of available register budget for vectorization. unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8 : (MaxVGPRs * 32); - if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) { - LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with " - << MaxVGPRs << " registers available\n"); + if (DL->getTypeSizeInBits(AllocaTy) * 4 > Limit) { + LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with " << MaxVGPRs + << " registers available\n"); return false; } - LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n"); - // FIXME: There is no reason why we can't support larger arrays, we // are just being conservative for now. - // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these - // could also be promoted but we don't currently handle this case - if (!VectorTy || VectorTy->getNumElements() > 16 || - VectorTy->getNumElements() < 2) { + // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or + // equivalent. 
Potentially these could also be promoted but we don't currently + // handle this case + if (!VectorTy) { LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); return false; } - std::map<GetElementPtrInst*, Value*> GEPVectorIdx; + if (VectorTy->getNumElements() > 16 || VectorTy->getNumElements() < 2) { + LLVM_DEBUG(dbgs() << " " << *VectorTy + << " has an unsupported number of elements\n"); + return false; + } + + std::map<GetElementPtrInst *, Value *> GEPVectorIdx; SmallVector<Instruction *> WorkList; + SmallVector<Instruction *> UsersToRemove; SmallVector<Instruction *> DeferredInsts; SmallVector<Use *, 8> Uses; DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo; - for (Use &U : Alloca->uses()) + const auto RejectUser = [&](Instruction *Inst, Twine Msg) { + LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n" + << " " << *Inst << "\n"); + return false; + }; + + for (Use &U : Alloca.uses()) Uses.push_back(&U); + LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n"); + Type *VecEltTy = VectorTy->getElementType(); - unsigned ElementSize = DL.getTypeSizeInBits(VecEltTy) / 8; + unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8; while (!Uses.empty()) { Use *U = Uses.pop_back_val(); Instruction *Inst = cast<Instruction>(U->getUser()); @@ -441,22 +668,29 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, // This is a store of the pointer, not to the pointer. if (isa<StoreInst>(Inst) && U->getOperandNo() != StoreInst::getPointerOperandIndex()) - return false; + return RejectUser(Inst, "pointer is being stored"); Type *AccessTy = getLoadStoreType(Inst); + if (AccessTy->isAggregateType()) + return RejectUser(Inst, "unsupported load/store as aggregate"); + assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy()); + Ptr = Ptr->stripPointerCasts(); - // Alloca already accessed as vector, leave alone. 
- if (Ptr == Alloca && DL.getTypeStoreSize(Alloca->getAllocatedType()) == - DL.getTypeStoreSize(AccessTy)) + // Alloca already accessed as vector. + if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) == + DL->getTypeStoreSize(AccessTy)) { + WorkList.push_back(Inst); continue; + } // Check that this is a simple access of a vector element. bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple() : cast<StoreInst>(Inst)->isSimple(); - if (!IsSimple || - !CastInst::isBitOrNoopPointerCastable(VecEltTy, AccessTy, DL)) - return false; + if (!IsSimple) + return RejectUser(Inst, "not a simple load or store"); + if (!isSupportedAccessType(VectorTy, AccessTy, *DL)) + return RejectUser(Inst, "not a supported access type"); WorkList.push_back(Inst); continue; @@ -466,32 +700,38 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, // Look through bitcasts. for (Use &U : Inst->uses()) Uses.push_back(&U); + UsersToRemove.push_back(Inst); continue; } if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) { // If we can't compute a vector index from this GEP, then we can't // promote this alloca to vector. 
- Value *Index = GEPToVectorIndex(GEP, Alloca, VecEltTy, DL); - if (!Index) { - LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP - << '\n'); - return false; - } + Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL); + if (!Index) + return RejectUser(Inst, "cannot compute vector index for GEP"); GEPVectorIdx[GEP] = Index; for (Use &U : Inst->uses()) Uses.push_back(&U); + UsersToRemove.push_back(Inst); + continue; + } + + if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst); + MSI && isSupportedMemset(MSI, &Alloca, *DL)) { + WorkList.push_back(Inst); continue; } if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) { if (TransferInst->isVolatile()) - return false; + return RejectUser(Inst, "mem transfer inst is volatile"); ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength()); - if (!Len || !!(Len->getZExtValue() % ElementSize)) - return false; + if (!Len || (Len->getZExtValue() % ElementSize)) + return RejectUser(Inst, "mem transfer inst length is non-constant or " + "not a multiple of the vector element size"); if (!TransferInfo.count(TransferInst)) { DeferredInsts.push_back(Inst); @@ -501,7 +741,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * { GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); - if (Ptr != Alloca && !GEPVectorIdx.count(GEP)) + if (Ptr != &Alloca && !GEPVectorIdx.count(GEP)) return nullptr; return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx)); @@ -513,30 +753,33 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, Value *Dest = TransferInst->getDest(); ConstantInt *Index = getPointerIndexOfAlloca(Dest); if (!Index) - return false; + return RejectUser(Inst, "could not calculate constant dest index"); TI->DestIndex = Index; } else { assert(OpNum == 1); Value *Src = TransferInst->getSource(); ConstantInt *Index = getPointerIndexOfAlloca(Src); if 
(!Index) - return false; + return RejectUser(Inst, "could not calculate constant src index"); TI->SrcIndex = Index; } continue; } // Ignore assume-like intrinsics and comparisons used in assumes. - if (isAssumeLikeIntrinsic(Inst)) + if (isAssumeLikeIntrinsic(Inst)) { + UsersToRemove.push_back(Inst); continue; + } if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) { return isAssumeLikeIntrinsic(cast<Instruction>(U)); - })) + })) { + UsersToRemove.push_back(Inst); continue; + } - // Unknown user. - return false; + return RejectUser(Inst, "unhandled alloca user"); } while (!DeferredInsts.empty()) { @@ -546,82 +789,194 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, // from different address spaces. MemTransferInfo &Info = TransferInfo[TransferInst]; if (!Info.SrcIndex || !Info.DestIndex) - return false; + return RejectUser( + Inst, "mem transfer inst is missing constant src and/or dst index"); } LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " << *VectorTy << '\n'); + const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy); - for (Instruction *Inst : WorkList) { - IRBuilder<> Builder(Inst); - switch (Inst->getOpcode()) { - case Instruction::Load: { - Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand(); - Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace()); - Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); - Value *VecValue = - Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca->getAlign()); - Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); - if (Inst->getType() != VecEltTy) - ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType()); - Inst->replaceAllUsesWith(ExtractElement); - Inst->eraseFromParent(); - break; - } - case Instruction::Store: { - StoreInst *SI = cast<StoreInst>(Inst); - Value *Ptr = SI->getPointerOperand(); - Value *Index = 
calculateVectorIndex(Ptr, GEPVectorIdx); - Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace()); - Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); - Value *VecValue = - Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca->getAlign()); - Value *Elt = SI->getValueOperand(); - if (Elt->getType() != VecEltTy) - Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy); - Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index); - Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca->getAlign()); - Inst->eraseFromParent(); - break; - } - case Instruction::Call: { - if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) { - ConstantInt *Length = cast<ConstantInt>(MTI->getLength()); - unsigned NumCopied = Length->getZExtValue() / ElementSize; - MemTransferInfo *TI = &TransferInfo[cast<MemTransferInst>(Inst)]; - unsigned SrcBegin = TI->SrcIndex->getZExtValue(); - unsigned DestBegin = TI->DestIndex->getZExtValue(); + // Alloca is uninitialized memory. Imitate that by making the first value + // undef. + SSAUpdater Updater; + Updater.Initialize(VectorTy, "promotealloca"); + Updater.AddAvailableValue(Alloca.getParent(), UndefValue::get(VectorTy)); - SmallVector<int> Mask; - for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) { - if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { - Mask.push_back(SrcBegin++); - } else { - Mask.push_back(Idx); - } - } - Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace()); - Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); - Value *VecValue = - Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca->getAlign()); - Value *NewVecValue = Builder.CreateShuffleVector(VecValue, Mask); - Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca->getAlign()); + // First handle the initial worklist. 
+ SmallVector<LoadInst *, 4> DeferredLoads; + forEachWorkListItem(WorkList, [&](Instruction *I) { + BasicBlock *BB = I->getParent(); + // On the first pass, we only take values that are trivially known, i.e. + // where AddAvailableValue was already called in this block. + Value *Result = promoteAllocaUserToVector( + I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, + Updater.FindValueForBlock(BB), DeferredLoads); + if (Result) + Updater.AddAvailableValue(BB, Result); + }); - Inst->eraseFromParent(); - } else { - llvm_unreachable("Unsupported call when promoting alloca to vector"); - } - break; - } + // Then handle deferred loads. + forEachWorkListItem(DeferredLoads, [&](Instruction *I) { + SmallVector<LoadInst *, 0> NewDLs; + BasicBlock *BB = I->getParent(); + // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always + // get a value, inserting PHIs as needed. + Value *Result = promoteAllocaUserToVector( + I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, + Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs); + if (Result) + Updater.AddAvailableValue(BB, Result); + assert(NewDLs.empty() && "No more deferred loads should be queued!"); + }); - default: - llvm_unreachable("Inconsistency in instructions promotable to vector"); - } + // Delete all instructions. On the first pass, new dummy loads may have been + // added so we need to collect them too. + DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end()); + InstsToDelete.insert(DeferredLoads.begin(), DeferredLoads.end()); + for (Instruction *I : InstsToDelete) { + assert(I->use_empty()); + I->eraseFromParent(); + } + + // Delete all the users that are known to be removeable. + for (Instruction *I : reverse(UsersToRemove)) { + I->dropDroppableUses(); + assert(I->use_empty()); + I->eraseFromParent(); } + + // Alloca should now be dead too. 
+ assert(Alloca.use_empty()); + Alloca.eraseFromParent(); return true; } +std::pair<Value *, Value *> +AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { + Function &F = *Builder.GetInsertBlock()->getParent(); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); + + if (!IsAMDHSA) { + Function *LocalSizeYFn = + Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y); + Function *LocalSizeZFn = + Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z); + + CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {}); + CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {}); + + ST.makeLIDRangeMetadata(LocalSizeY); + ST.makeLIDRangeMetadata(LocalSizeZ); + + return std::pair(LocalSizeY, LocalSizeZ); + } + + // We must read the size out of the dispatch pointer. + assert(IsAMDGCN); + + // We are indexing into this struct, and want to extract the workgroup_size_* + // fields. + // + // typedef struct hsa_kernel_dispatch_packet_s { + // uint16_t header; + // uint16_t setup; + // uint16_t workgroup_size_x ; + // uint16_t workgroup_size_y; + // uint16_t workgroup_size_z; + // uint16_t reserved0; + // uint32_t grid_size_x ; + // uint32_t grid_size_y ; + // uint32_t grid_size_z; + // + // uint32_t private_segment_size; + // uint32_t group_segment_size; + // uint64_t kernel_object; + // + // #ifdef HSA_LARGE_MODEL + // void *kernarg_address; + // #elif defined HSA_LITTLE_ENDIAN + // void *kernarg_address; + // uint32_t reserved1; + // #else + // uint32_t reserved1; + // void *kernarg_address; + // #endif + // uint64_t reserved2; + // hsa_signal_t completion_signal; // uint64_t wrapper + // } hsa_kernel_dispatch_packet_t + // + Function *DispatchPtrFn = + Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr); + + CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {}); + DispatchPtr->addRetAttr(Attribute::NoAlias); + DispatchPtr->addRetAttr(Attribute::NonNull); + F.removeFnAttr("amdgpu-no-dispatch-ptr"); + + // Size of 
the dispatch packet struct. + DispatchPtr->addDereferenceableRetAttr(64); + + Type *I32Ty = Type::getInt32Ty(Mod->getContext()); + Value *CastDispatchPtr = Builder.CreateBitCast( + DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS)); + + // We could do a single 64-bit load here, but it's likely that the basic + // 32-bit and extract sequence is already present, and it is probably easier + // to CSE this. The loads should be mergeable later anyway. + Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1); + LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4)); + + Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2); + LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4)); + + MDNode *MD = MDNode::get(Mod->getContext(), std::nullopt); + LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD); + LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD); + ST.makeLIDRangeMetadata(LoadZU); + + // Extract y component. Upper half of LoadZU should be zero already. + Value *Y = Builder.CreateLShr(LoadXY, 16); + + return std::pair(Y, LoadZU); +} + +Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder, + unsigned N) { + Function *F = Builder.GetInsertBlock()->getParent(); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F); + Intrinsic::ID IntrID = Intrinsic::not_intrinsic; + StringRef AttrName; + + switch (N) { + case 0: + IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x + : (Intrinsic::ID)Intrinsic::r600_read_tidig_x; + AttrName = "amdgpu-no-workitem-id-x"; + break; + case 1: + IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y + : (Intrinsic::ID)Intrinsic::r600_read_tidig_y; + AttrName = "amdgpu-no-workitem-id-y"; + break; + + case 2: + IntrID = IsAMDGCN ? 
(Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z + : (Intrinsic::ID)Intrinsic::r600_read_tidig_z; + AttrName = "amdgpu-no-workitem-id-z"; + break; + default: + llvm_unreachable("invalid dimension"); + } + + Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID); + CallInst *CI = Builder.CreateCall(WorkitemIdFn); + ST.makeLIDRangeMetadata(CI); + F->removeFnAttr(AttrName); + + return CI; +} + static bool isCallPromotable(CallInst *CI) { IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); if (!II) @@ -883,8 +1238,8 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { CurrentLocalMemUsage += Alloc.first; } - unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, - F); + unsigned MaxOccupancy = + ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, F); // Restrict local memory usage so that we don't drastically reduce occupancy, // unless it is already significantly reduced. @@ -902,10 +1257,9 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { // usage. MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); - // Round up to the next tier of usage. - unsigned MaxSizeWithWaveCount - = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); + unsigned MaxSizeWithWaveCount = + ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); // Program is possibly broken by using more local mem than available. if (CurrentLocalMemUsage > MaxSizeWithWaveCount) @@ -924,26 +1278,18 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { } // FIXME: Should try to pick the most likely to be profitable allocas first. -bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { - // Array allocations are probably not worth handling, since an allocation of - // the array type is the canonical form. 
- if (!I.isStaticAlloca() || I.isArrayAllocation()) +bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, + bool SufficientLDS) { + LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n'); + + if (DisablePromoteAllocaToLDS) { + LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n"); return false; + } const DataLayout &DL = Mod->getDataLayout(); IRBuilder<> Builder(&I); - // First try to replace the alloca with a vector - Type *AllocaTy = I.getAllocatedType(); - - LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); - - if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs)) - return true; // Promoted to vector. - - if (DisablePromoteAllocaToLDS) - return false; - const Function &ContainingFunction = *I.getParent()->getParent(); CallingConv::ID CC = ContainingFunction.getCallingConv(); @@ -978,7 +1324,8 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { // could end up using more than the maximum due to alignment padding. uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment); - uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy); + uint32_t AllocSize = + WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType()); NewSize += AllocSize; if (NewSize > LocalMemLimit) { @@ -989,7 +1336,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { CurrentLocalMemUsage = NewSize; - std::vector<Value*> WorkList; + std::vector<Value *> WorkList; if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n"); @@ -1021,10 +1368,8 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { Value *TID = Builder.CreateAdd(Tmp0, Tmp1); TID = Builder.CreateAdd(TID, TIdZ); - Value *Indices[] = { - Constant::getNullValue(Type::getInt32Ty(Mod->getContext())), - TID - }; + LLVMContext &Context = Mod->getContext(); + Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID}; Value *Offset = 
Builder.CreateInBoundsGEP(GVTy, GV, Indices); I.mutateType(Offset->getType()); @@ -1037,9 +1382,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { CallInst *Call = dyn_cast<CallInst>(V); if (!Call) { if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) { - Value *Src0 = CI->getOperand(0); - PointerType *NewTy = PointerType::getWithSamePointeeType( - cast<PointerType>(Src0->getType()), AMDGPUAS::LOCAL_ADDRESS); + PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS); if (isa<ConstantPointerNull>(CI->getOperand(0))) CI->setOperand(0, ConstantPointerNull::get(NewTy)); @@ -1055,8 +1398,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { if (isa<AddrSpaceCastInst>(V)) continue; - PointerType *NewTy = PointerType::getWithSamePointeeType( - cast<PointerType>(V->getType()), AMDGPUAS::LOCAL_ADDRESS); + PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS); // FIXME: It doesn't really make sense to try to do this for all // instructions. 
@@ -1116,8 +1458,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { Function *ObjectSize = Intrinsic::getDeclaration( Mod, Intrinsic::objectsize, {Intr->getType(), - PointerType::getWithSamePointeeType( - cast<PointerType>(Src->getType()), AMDGPUAS::LOCAL_ADDRESS)}); + PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)}); CallInst *NewCall = Builder.CreateCall( ObjectSize, @@ -1138,10 +1479,9 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove); MemTransferInst *MI = cast<MemTransferInst>(Intr); - auto *B = - Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(), - MI->getRawSource(), MI->getSourceAlign(), - MI->getLength(), MI->isVolatile()); + auto *B = Builder.CreateMemTransferInst( + ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(), + MI->getSourceAlign(), MI->getLength(), MI->isVolatile()); for (unsigned I = 0; I != 2; ++I) { if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) { @@ -1154,80 +1494,3 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { return true; } - -bool handlePromoteAllocaToVector(AllocaInst &I, unsigned MaxVGPRs) { - // Array allocations are probably not worth handling, since an allocation of - // the array type is the canonical form. 
- if (!I.isStaticAlloca() || I.isArrayAllocation()) - return false; - - LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); - - Module *Mod = I.getParent()->getParent()->getParent(); - return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs); -} - -bool promoteAllocasToVector(Function &F, TargetMachine &TM) { - if (DisablePromoteAllocaToVector) - return false; - - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); - if (!ST.isPromoteAllocaEnabled()) - return false; - - unsigned MaxVGPRs; - if (TM.getTargetTriple().getArch() == Triple::amdgcn) { - const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); - MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); - // A non-entry function has only 32 caller preserved registers. - // Do not promote alloca which will force spilling. - if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) - MaxVGPRs = std::min(MaxVGPRs, 32u); - } else { - MaxVGPRs = 128; - } - - bool Changed = false; - BasicBlock &EntryBB = *F.begin(); - - SmallVector<AllocaInst *, 16> Allocas; - for (Instruction &I : EntryBB) { - if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) - Allocas.push_back(AI); - } - - for (AllocaInst *AI : Allocas) { - if (handlePromoteAllocaToVector(*AI, MaxVGPRs)) - Changed = true; - } - - return Changed; -} - -bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { - return promoteAllocasToVector(F, TPC->getTM<TargetMachine>()); - } - return false; -} - -PreservedAnalyses -AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) { - bool Changed = promoteAllocasToVector(F, TM); - if (Changed) { - PreservedAnalyses PA; - PA.preserveSet<CFGAnalyses>(); - return PA; - } - return PreservedAnalyses::all(); -} - -FunctionPass *llvm::createAMDGPUPromoteAlloca() { - return new AMDGPUPromoteAlloca(); -} - -FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() { - return new 
AMDGPUPromoteAllocaToVector(); -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp index ed450f59e4b3..9b654a2bba7f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp @@ -116,7 +116,7 @@ bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) { // Cast pointer to global address space and back to flat and let // Infer Address Spaces pass to do all necessary rewriting. PointerType *NewPT = - PointerType::getWithSamePointeeType(PT, AMDGPUAS::GLOBAL_ADDRESS); + PointerType::get(PT->getContext(), AMDGPUAS::GLOBAL_ADDRESS); Value *Cast = B.CreateAddrSpaceCast(Ptr, NewPT, Twine(Ptr->getName(), ".global")); Value *CastBack = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp deleted file mode 100644 index 5a4ab467731e..000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp +++ /dev/null @@ -1,426 +0,0 @@ -//===--- AMDGPUPropagateAttributes.cpp --------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief This pass propagates attributes from kernels to the non-entry -/// functions. Most of the library functions were not compiled for specific ABI, -/// yet will be correctly compiled if proper attributes are propagated from the -/// caller. -/// -/// The pass analyzes call graph and propagates ABI target features through the -/// call graph. -/// -/// It can run in two modes: as a function or module pass. A function pass -/// simply propagates attributes. A module pass clones functions if there are -/// callers with different ABI. 
If a function is cloned all call sites will -/// be updated to use a correct clone. -/// -/// A function pass is limited in functionality but can run early in the -/// pipeline. A module pass is more powerful but has to run late, so misses -/// library folding opportunities. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/Utils/Cloning.h" - -#define DEBUG_TYPE "amdgpu-propagate-attributes" - -using namespace llvm; - -namespace llvm { -extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1]; -} - -namespace { - -// Target features to propagate. -static constexpr const FeatureBitset TargetFeatures = { - AMDGPU::FeatureWavefrontSize16, - AMDGPU::FeatureWavefrontSize32, - AMDGPU::FeatureWavefrontSize64 -}; - -// Attributes to propagate. -// TODO: Support conservative min/max merging instead of cloning. 
-static constexpr const char *AttributeNames[] = {"amdgpu-waves-per-eu"}; - -static constexpr unsigned NumAttr = std::size(AttributeNames); - -class AMDGPUPropagateAttributes { - - class FnProperties { - private: - explicit FnProperties(const FeatureBitset &&FB) : Features(FB) {} - - public: - explicit FnProperties(const TargetMachine &TM, const Function &F) { - Features = TM.getSubtargetImpl(F)->getFeatureBits(); - - for (unsigned I = 0; I < NumAttr; ++I) - if (F.hasFnAttribute(AttributeNames[I])) - Attributes[I] = F.getFnAttribute(AttributeNames[I]); - } - - bool operator == (const FnProperties &Other) const { - if ((Features & TargetFeatures) != (Other.Features & TargetFeatures)) - return false; - for (unsigned I = 0; I < NumAttr; ++I) - if (Attributes[I] != Other.Attributes[I]) - return false; - return true; - } - - FnProperties adjustToCaller(const FnProperties &CallerProps) const { - FnProperties New((Features & ~TargetFeatures) | CallerProps.Features); - for (unsigned I = 0; I < NumAttr; ++I) - New.Attributes[I] = CallerProps.Attributes[I]; - return New; - } - - FeatureBitset Features; - std::optional<Attribute> Attributes[NumAttr]; - }; - - class Clone { - public: - Clone(const FnProperties &Props, Function *OrigF, Function *NewF) : - Properties(Props), OrigF(OrigF), NewF(NewF) {} - - FnProperties Properties; - Function *OrigF; - Function *NewF; - }; - - const TargetMachine *TM; - - // Clone functions as needed or just set attributes. - bool AllowClone; - - // Option propagation roots. - SmallSet<Function *, 32> Roots; - - // Clones of functions with their attributes. - SmallVector<Clone, 32> Clones; - - // Find a clone with required features. - Function *findFunction(const FnProperties &PropsNeeded, - Function *OrigF); - - // Clone function \p F and set \p NewProps on the clone. - // Cole takes the name of original function. - Function *cloneWithProperties(Function &F, const FnProperties &NewProps); - - // Set new function's features in place. 
- void setFeatures(Function &F, const FeatureBitset &NewFeatures); - - // Set new function's attributes in place. - void setAttributes(Function &F, - const ArrayRef<std::optional<Attribute>> NewAttrs); - - std::string getFeatureString(const FeatureBitset &Features) const; - - // Propagate attributes from Roots. - bool process(); - -public: - AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) : - TM(TM), AllowClone(AllowClone) {} - - // Use F as a root and propagate its attributes. - bool process(Function &F); - - // Propagate attributes starting from kernel functions. - bool process(Module &M); -}; - -// Allows to propagate attributes early, but no cloning is allowed as it must -// be a function pass to run before any optimizations. -// TODO: We shall only need a one instance of module pass, but that needs to be -// in the linker pipeline which is currently not possible. -class AMDGPUPropagateAttributesEarly : public FunctionPass { - const TargetMachine *TM; - -public: - static char ID; // Pass identification - - AMDGPUPropagateAttributesEarly(const TargetMachine *TM = nullptr) : - FunctionPass(ID), TM(TM) { - initializeAMDGPUPropagateAttributesEarlyPass( - *PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override; -}; - -// Allows to propagate attributes with cloning but does that late in the -// pipeline. -class AMDGPUPropagateAttributesLate : public ModulePass { - const TargetMachine *TM; - -public: - static char ID; // Pass identification - - AMDGPUPropagateAttributesLate(const TargetMachine *TM = nullptr) : - ModulePass(ID), TM(TM) { - initializeAMDGPUPropagateAttributesLatePass( - *PassRegistry::getPassRegistry()); - } - - bool runOnModule(Module &M) override; -}; - -} // end anonymous namespace. 
- -char AMDGPUPropagateAttributesEarly::ID = 0; -char AMDGPUPropagateAttributesLate::ID = 0; - -INITIALIZE_PASS(AMDGPUPropagateAttributesEarly, - "amdgpu-propagate-attributes-early", - "Early propagate attributes from kernels to functions", - false, false) -INITIALIZE_PASS(AMDGPUPropagateAttributesLate, - "amdgpu-propagate-attributes-late", - "Late propagate attributes from kernels to functions", - false, false) - -Function * -AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded, - Function *OrigF) { - // TODO: search for clone's clones. - for (Clone &C : Clones) - if (C.OrigF == OrigF && PropsNeeded == C.Properties) - return C.NewF; - - return nullptr; -} - -bool AMDGPUPropagateAttributes::process(Module &M) { - for (auto &F : M.functions()) - if (AMDGPU::isKernel(F.getCallingConv())) - Roots.insert(&F); - - return Roots.empty() ? false : process(); -} - -bool AMDGPUPropagateAttributes::process(Function &F) { - Roots.insert(&F); - return process(); -} - -bool AMDGPUPropagateAttributes::process() { - bool Changed = false; - SmallSet<Function *, 32> NewRoots; - SmallSet<Function *, 32> Replaced; - - assert(!Roots.empty()); - Module &M = *(*Roots.begin())->getParent(); - - do { - Roots.insert(NewRoots.begin(), NewRoots.end()); - NewRoots.clear(); - - for (auto &F : M.functions()) { - if (F.isDeclaration()) - continue; - - const FnProperties CalleeProps(*TM, F); - SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace; - SmallSet<CallBase *, 32> Visited; - - for (User *U : F.users()) { - Instruction *I = dyn_cast<Instruction>(U); - if (!I) - continue; - CallBase *CI = dyn_cast<CallBase>(I); - // Only propagate attributes if F is the called function. Specifically, - // do not propagate attributes if F is passed as an argument. - // FIXME: handle bitcasted callee, e.g. 
- // %retval = call i8* bitcast (i32* ()* @f to i8* ()*)() - if (!CI || CI->getCalledOperand() != &F) - continue; - Function *Caller = CI->getCaller(); - if (!Caller || !Visited.insert(CI).second) - continue; - if (!Roots.count(Caller) && !NewRoots.count(Caller)) - continue; - - const FnProperties CallerProps(*TM, *Caller); - - if (CalleeProps == CallerProps) { - if (!Roots.count(&F)) - NewRoots.insert(&F); - continue; - } - - Function *NewF = findFunction(CallerProps, &F); - if (!NewF) { - const FnProperties NewProps = CalleeProps.adjustToCaller(CallerProps); - if (!AllowClone) { - // This may set different features on different iterations if - // there is a contradiction in callers' attributes. In this case - // we rely on a second pass running on Module, which is allowed - // to clone. - setFeatures(F, NewProps.Features); - setAttributes(F, NewProps.Attributes); - NewRoots.insert(&F); - Changed = true; - break; - } - - NewF = cloneWithProperties(F, NewProps); - Clones.push_back(Clone(CallerProps, &F, NewF)); - NewRoots.insert(NewF); - } - - ToReplace.push_back(std::pair(CI, NewF)); - Replaced.insert(&F); - - Changed = true; - } - - while (!ToReplace.empty()) { - auto R = ToReplace.pop_back_val(); - R.first->setCalledFunction(R.second); - } - } - } while (!NewRoots.empty()); - - for (Function *F : Replaced) { - if (F->use_empty()) - F->eraseFromParent(); - } - - Roots.clear(); - Clones.clear(); - - return Changed; -} - -Function * -AMDGPUPropagateAttributes::cloneWithProperties(Function &F, - const FnProperties &NewProps) { - LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n'); - - ValueToValueMapTy dummy; - Function *NewF = CloneFunction(&F, dummy); - setFeatures(*NewF, NewProps.Features); - setAttributes(*NewF, NewProps.Attributes); - NewF->setVisibility(GlobalValue::DefaultVisibility); - NewF->setLinkage(GlobalValue::InternalLinkage); - - // Swap names. If that is the only clone it will retain the name of now - // dead value. 
Preserve original name for externally visible functions. - if (F.hasName() && F.hasLocalLinkage()) { - std::string NewName = std::string(NewF->getName()); - NewF->takeName(&F); - F.setName(NewName); - } - - return NewF; -} - -void AMDGPUPropagateAttributes::setFeatures(Function &F, - const FeatureBitset &NewFeatures) { - std::string NewFeatureStr = getFeatureString(NewFeatures); - - LLVM_DEBUG(dbgs() << "Set features " - << getFeatureString(NewFeatures & TargetFeatures) - << " on " << F.getName() << '\n'); - - F.removeFnAttr("target-features"); - F.addFnAttr("target-features", NewFeatureStr); -} - -void AMDGPUPropagateAttributes::setAttributes( - Function &F, const ArrayRef<std::optional<Attribute>> NewAttrs) { - LLVM_DEBUG(dbgs() << "Set attributes on " << F.getName() << ":\n"); - for (unsigned I = 0; I < NumAttr; ++I) { - F.removeFnAttr(AttributeNames[I]); - if (NewAttrs[I]) { - LLVM_DEBUG(dbgs() << '\t' << NewAttrs[I]->getAsString() << '\n'); - F.addFnAttr(*NewAttrs[I]); - } - } -} - -std::string -AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const -{ - std::string Ret; - for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV) { - if (Features[KV.Value]) - Ret += (StringRef("+") + KV.Key + ",").str(); - else if (TargetFeatures[KV.Value]) - Ret += (StringRef("-") + KV.Key + ",").str(); - } - Ret.pop_back(); // Remove last comma. 
- return Ret; -} - -bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) { - if (!TM) { - auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); - if (!TPC) - return false; - - TM = &TPC->getTM<TargetMachine>(); - } - - if (!AMDGPU::isKernel(F.getCallingConv())) - return false; - - return AMDGPUPropagateAttributes(TM, false).process(F); -} - -bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) { - if (!TM) { - auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); - if (!TPC) - return false; - - TM = &TPC->getTM<TargetMachine>(); - } - - return AMDGPUPropagateAttributes(TM, true).process(M); -} - -FunctionPass -*llvm::createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *TM) { - return new AMDGPUPropagateAttributesEarly(TM); -} - -ModulePass -*llvm::createAMDGPUPropagateAttributesLatePass(const TargetMachine *TM) { - return new AMDGPUPropagateAttributesLate(TM); -} - -PreservedAnalyses -AMDGPUPropagateAttributesEarlyPass::run(Function &F, - FunctionAnalysisManager &AM) { - if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) - return PreservedAnalyses::all(); - - return AMDGPUPropagateAttributes(&TM, false).process(F) - ? PreservedAnalyses::none() - : PreservedAnalyses::all(); -} - -PreservedAnalyses -AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &AM) { - return AMDGPUPropagateAttributes(&TM, true).process(M) - ? 
PreservedAnalyses::none() - : PreservedAnalyses::all(); -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index b4315950b225..c935e384da8e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -20,37 +20,55 @@ #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" +#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h" +#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Target/TargetMachine.h" + +#define GET_GICOMBINER_DEPS +#include "AMDGPUGenPreLegalizeGICombiner.inc" +#undef GET_GICOMBINER_DEPS + #define DEBUG_TYPE "amdgpu-regbank-combiner" using namespace llvm; using namespace MIPatternMatch; -class AMDGPURegBankCombinerHelper { +namespace { +#define GET_GICOMBINER_TYPES +#include "AMDGPUGenRegBankGICombiner.inc" +#undef GET_GICOMBINER_TYPES + +class AMDGPURegBankCombinerImpl : public GIMatchTableExecutor { protected: + const AMDGPURegBankCombinerImplRuleConfig &RuleConfig; + MachineIRBuilder &B; MachineFunction &MF; MachineRegisterInfo &MRI; - const GCNSubtarget &Subtarget; + const GCNSubtarget &STI; const RegisterBankInfo &RBI; const TargetRegisterInfo &TRI; const SIInstrInfo &TII; CombinerHelper &Helper; + GISelChangeObserver &Observer; public: - AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) - : B(B), MF(B.getMF()), MRI(*B.getMRI()), - Subtarget(MF.getSubtarget<GCNSubtarget>()), - RBI(*Subtarget.getRegBankInfo()), TRI(*Subtarget.getRegisterInfo()), - TII(*Subtarget.getInstrInfo()), Helper(Helper){}; + AMDGPURegBankCombinerImpl( + const 
AMDGPURegBankCombinerImplRuleConfig &RuleConfig, + MachineIRBuilder &B, CombinerHelper &Helper, + GISelChangeObserver &Observer); + + static const char *getName() { return "AMDGPURegBankCombinerImpl"; } + + bool tryCombineAll(MachineInstr &I) const; - bool isVgprRegBank(Register Reg); - Register getAsVgpr(Register Reg); + bool isVgprRegBank(Register Reg) const; + Register getAsVgpr(Register Reg) const; struct MinMaxMedOpc { unsigned Min, Max, Med; @@ -61,33 +79,58 @@ public: Register Val0, Val1, Val2; }; - MinMaxMedOpc getMinMaxPair(unsigned Opc); + MinMaxMedOpc getMinMaxPair(unsigned Opc) const; template <class m_Cst, typename CstTy> bool matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc, - Register &Val, CstTy &K0, CstTy &K1); + Register &Val, CstTy &K0, CstTy &K1) const; - bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); - bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); - bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg); - bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg); - void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); - void applyClamp(MachineInstr &MI, Register &Reg); + bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const; + bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const; + bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg) const; + bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg) const; + void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const; + void applyClamp(MachineInstr &MI, Register &Reg) const; private: - AMDGPU::SIModeRegisterDefaults getMode(); - bool getIEEE(); - bool getDX10Clamp(); - bool isFminnumIeee(const MachineInstr &MI); - bool isFCst(MachineInstr *MI); - bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1); + SIModeRegisterDefaults getMode() const; + bool getIEEE() const; + bool getDX10Clamp() const; + bool isFminnumIeee(const MachineInstr &MI) const; + bool 
isFCst(MachineInstr *MI) const; + bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1) const; + +#define GET_GICOMBINER_CLASS_MEMBERS +#define AMDGPUSubtarget GCNSubtarget +#include "AMDGPUGenRegBankGICombiner.inc" +#undef GET_GICOMBINER_CLASS_MEMBERS +#undef AMDGPUSubtarget }; -bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) { +#define GET_GICOMBINER_IMPL +#define AMDGPUSubtarget GCNSubtarget +#include "AMDGPUGenRegBankGICombiner.inc" +#undef AMDGPUSubtarget +#undef GET_GICOMBINER_IMPL + +AMDGPURegBankCombinerImpl::AMDGPURegBankCombinerImpl( + const AMDGPURegBankCombinerImplRuleConfig &RuleConfig, MachineIRBuilder &B, + CombinerHelper &Helper, GISelChangeObserver &Observer) + : RuleConfig(RuleConfig), B(B), MF(B.getMF()), MRI(*B.getMRI()), + STI(MF.getSubtarget<GCNSubtarget>()), RBI(*STI.getRegBankInfo()), + TRI(*STI.getRegisterInfo()), TII(*STI.getInstrInfo()), Helper(Helper), + Observer(Observer), +#define GET_GICOMBINER_CONSTRUCTOR_INITS +#include "AMDGPUGenRegBankGICombiner.inc" +#undef GET_GICOMBINER_CONSTRUCTOR_INITS +{ +} + +bool AMDGPURegBankCombinerImpl::isVgprRegBank(Register Reg) const { return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID; } -Register AMDGPURegBankCombinerHelper::getAsVgpr(Register Reg) { +Register AMDGPURegBankCombinerImpl::getAsVgpr(Register Reg) const { if (isVgprRegBank(Reg)) return Reg; @@ -104,8 +147,8 @@ Register AMDGPURegBankCombinerHelper::getAsVgpr(Register Reg) { return VgprReg; } -AMDGPURegBankCombinerHelper::MinMaxMedOpc -AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) { +AMDGPURegBankCombinerImpl::MinMaxMedOpc +AMDGPURegBankCombinerImpl::getMinMaxPair(unsigned Opc) const { switch (Opc) { default: llvm_unreachable("Unsupported opcode"); @@ -126,10 +169,10 @@ AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) { } template <class m_Cst, typename CstTy> -bool AMDGPURegBankCombinerHelper::matchMed(MachineInstr &MI, - MachineRegisterInfo &MRI, - MinMaxMedOpc MMMOpc, Register 
&Val, - CstTy &K0, CstTy &K1) { +bool AMDGPURegBankCombinerImpl::matchMed(MachineInstr &MI, + MachineRegisterInfo &MRI, + MinMaxMedOpc MMMOpc, Register &Val, + CstTy &K0, CstTy &K1) const { // 4 operand commutes of: min(max(Val, K0), K1). // Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)). // Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0). @@ -147,16 +190,15 @@ bool AMDGPURegBankCombinerHelper::matchMed(MachineInstr &MI, m_Cst(K0)))); } -bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3( - MachineInstr &MI, Med3MatchInfo &MatchInfo) { +bool AMDGPURegBankCombinerImpl::matchIntMinMaxToMed3( + MachineInstr &MI, Med3MatchInfo &MatchInfo) const { Register Dst = MI.getOperand(0).getReg(); if (!isVgprRegBank(Dst)) return false; // med3 for i16 is only available on gfx9+, and not available for v2i16. LLT Ty = MRI.getType(Dst); - if ((Ty != LLT::scalar(16) || !Subtarget.hasMed3_16()) && - Ty != LLT::scalar(32)) + if ((Ty != LLT::scalar(16) || !STI.hasMed3_16()) && Ty != LLT::scalar(32)) return false; MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode()); @@ -193,14 +235,13 @@ bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3( // fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0 // min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true) // max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0 -bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3( - MachineInstr &MI, Med3MatchInfo &MatchInfo) { +bool AMDGPURegBankCombinerImpl::matchFPMinMaxToMed3( + MachineInstr &MI, Med3MatchInfo &MatchInfo) const { Register Dst = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(Dst); // med3 for f16 is only available on gfx9+, and not available for v2f16. 
- if ((Ty != LLT::scalar(16) || !Subtarget.hasMed3_16()) && - Ty != LLT::scalar(32)) + if ((Ty != LLT::scalar(16) || !STI.hasMed3_16()) && Ty != LLT::scalar(32)) return false; auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); @@ -233,8 +274,8 @@ bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3( return false; } -bool AMDGPURegBankCombinerHelper::matchFPMinMaxToClamp(MachineInstr &MI, - Register &Reg) { +bool AMDGPURegBankCombinerImpl::matchFPMinMaxToClamp(MachineInstr &MI, + Register &Reg) const { // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16). auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); Register Val; @@ -269,16 +310,13 @@ bool AMDGPURegBankCombinerHelper::matchFPMinMaxToClamp(MachineInstr &MI, // min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0 // min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0 // min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0 -bool AMDGPURegBankCombinerHelper::matchFPMed3ToClamp(MachineInstr &MI, - Register &Reg) { - if (MI.getIntrinsicID() != Intrinsic::amdgcn_fmed3) - return false; - +bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI, + Register &Reg) const { // In llvm-ir, clamp is often represented as an intrinsic call to // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders. 
- MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); - MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI); - MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); + MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI); + MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); + MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI); if (isFCst(Src0) && !isFCst(Src1)) std::swap(Src0, Src1); @@ -311,15 +349,16 @@ bool AMDGPURegBankCombinerHelper::matchFPMed3ToClamp(MachineInstr &MI, return false; } -void AMDGPURegBankCombinerHelper::applyClamp(MachineInstr &MI, Register &Reg) { +void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI, + Register &Reg) const { B.setInstrAndDebugLoc(MI); B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg}, MI.getFlags()); MI.eraseFromParent(); } -void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, - Med3MatchInfo &MatchInfo) { +void AMDGPURegBankCombinerImpl::applyMed3(MachineInstr &MI, + Med3MatchInfo &MatchInfo) const { B.setInstrAndDebugLoc(MI); B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)}, {getAsVgpr(MatchInfo.Val0), getAsVgpr(MatchInfo.Val1), @@ -328,24 +367,26 @@ void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, MI.eraseFromParent(); } -AMDGPU::SIModeRegisterDefaults AMDGPURegBankCombinerHelper::getMode() { +SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const { return MF.getInfo<SIMachineFunctionInfo>()->getMode(); } -bool AMDGPURegBankCombinerHelper::getIEEE() { return getMode().IEEE; } +bool AMDGPURegBankCombinerImpl::getIEEE() const { return getMode().IEEE; } -bool AMDGPURegBankCombinerHelper::getDX10Clamp() { return getMode().DX10Clamp; } +bool AMDGPURegBankCombinerImpl::getDX10Clamp() const { + return getMode().DX10Clamp; +} -bool AMDGPURegBankCombinerHelper::isFminnumIeee(const MachineInstr &MI) { +bool 
AMDGPURegBankCombinerImpl::isFminnumIeee(const MachineInstr &MI) const { return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE; } -bool AMDGPURegBankCombinerHelper::isFCst(MachineInstr *MI) { +bool AMDGPURegBankCombinerImpl::isFCst(MachineInstr *MI) const { return MI->getOpcode() == AMDGPU::G_FCONSTANT; } -bool AMDGPURegBankCombinerHelper::isClampZeroToOne(MachineInstr *K0, - MachineInstr *K1) { +bool AMDGPURegBankCombinerImpl::isClampZeroToOne(MachineInstr *K0, + MachineInstr *K1) const { if (isFCst(K0) && isFCst(K1)) { const ConstantFP *KO_FPImm = K0->getOperand(1).getFPImm(); const ConstantFP *K1_FPImm = K1->getOperand(1).getFPImm(); @@ -355,40 +396,19 @@ bool AMDGPURegBankCombinerHelper::isClampZeroToOne(MachineInstr *K0, return false; } -class AMDGPURegBankCombinerHelperState { -protected: - CombinerHelper &Helper; - AMDGPURegBankCombinerHelper &RegBankHelper; - -public: - AMDGPURegBankCombinerHelperState(CombinerHelper &Helper, - AMDGPURegBankCombinerHelper &RegBankHelper) - : Helper(Helper), RegBankHelper(RegBankHelper) {} -}; - -#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS -#include "AMDGPUGenRegBankGICombiner.inc" -#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS - -namespace { -#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H -#include "AMDGPUGenRegBankGICombiner.inc" -#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H - class AMDGPURegBankCombinerInfo final : public CombinerInfo { GISelKnownBits *KB; MachineDominatorTree *MDT; + AMDGPURegBankCombinerImplRuleConfig RuleConfig; public: - AMDGPUGenRegBankCombinerHelperRuleConfig GeneratedRuleCfg; - AMDGPURegBankCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, - const AMDGPULegalizerInfo *LI, - GISelKnownBits *KB, MachineDominatorTree *MDT) + const AMDGPULegalizerInfo *LI, GISelKnownBits *KB, + MachineDominatorTree *MDT) : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize), KB(KB), MDT(MDT) { - if 
(!GeneratedRuleCfg.parseCommandLineOption()) + if (!RuleConfig.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } @@ -397,23 +417,15 @@ public: }; bool AMDGPURegBankCombinerInfo::combine(GISelChangeObserver &Observer, - MachineInstr &MI, - MachineIRBuilder &B) const { + MachineInstr &MI, + MachineIRBuilder &B) const { CombinerHelper Helper(Observer, B, /* IsPreLegalize*/ false, KB, MDT); - AMDGPURegBankCombinerHelper RegBankHelper(B, Helper); - AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg, Helper, - RegBankHelper); - - if (Generated.tryCombineAll(Observer, MI, B)) - return true; - - return false; + // TODO: Do not re-create the Impl on every inst, it should be per function. + AMDGPURegBankCombinerImpl Impl(RuleConfig, B, Helper, Observer); + Impl.setupMF(*MI.getMF(), KB); + return Impl.tryCombineAll(MI); } -#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP -#include "AMDGPUGenRegBankGICombiner.inc" -#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP - // Pass boilerplate // ================ @@ -423,9 +435,7 @@ public: AMDGPURegBankCombiner(bool IsOptNone = false); - StringRef getPassName() const override { - return "AMDGPURegBankCombiner"; - } + StringRef getPassName() const override { return "AMDGPURegBankCombiner"; } bool runOnMachineFunction(MachineFunction &MF) override; @@ -449,7 +459,7 @@ void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const { } AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone) - : MachineFunctionPass(ID), IsOptNone(IsOptNone) { + : MachineFunctionPass(ID), IsOptNone(IsOptNone) { initializeAMDGPURegBankCombinerPass(*PassRegistry::getPassRegistry()); } @@ -463,14 +473,14 @@ bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) { MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const AMDGPULegalizerInfo *LI - = static_cast<const AMDGPULegalizerInfo 
*>(ST.getLegalizerInfo()); + const AMDGPULegalizerInfo *LI = + static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo()); GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); MachineDominatorTree *MDT = IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); - AMDGPURegBankCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), - F.hasMinSize(), LI, KB, MDT); + AMDGPURegBankCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), F.hasMinSize(), + LI, KB, MDT); Combiner C(PCInfo, TPC); return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp new file mode 100644 index 000000000000..2ea03ddb1fcc --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp @@ -0,0 +1,77 @@ +//===- AMDGPURegBankSelect.cpp -----------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Use MachineUniformityAnalysis as the primary basis for making SGPR vs. VGPR +// register bank selection. Use/def analysis as in the default RegBankSelect can +// be useful in narrower circumstances (e.g. choosing AGPR vs. VGPR for gfx908). 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPURegBankSelect.h" +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "llvm/CodeGen/MachineUniformityAnalysis.h" +#include "llvm/InitializePasses.h" + +#define DEBUG_TYPE "regbankselect" + +using namespace llvm; + +AMDGPURegBankSelect::AMDGPURegBankSelect(Mode RunningMode) + : RegBankSelect(AMDGPURegBankSelect::ID, RunningMode) {} + +char AMDGPURegBankSelect::ID = 0; + +StringRef AMDGPURegBankSelect::getPassName() const { + return "AMDGPURegBankSelect"; +} + +void AMDGPURegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineCycleInfoWrapperPass>(); + AU.addRequired<MachineDominatorTree>(); + // TODO: Preserve DomTree + RegBankSelect::getAnalysisUsage(AU); +} + +INITIALIZE_PASS_BEGIN(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE, + "AMDGPU Register Bank Select", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE, + "AMDGPU Register Bank Select", false, false) + +bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) { + // If the ISel pipeline failed, do not bother running that pass. 
+ if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + + LLVM_DEBUG(dbgs() << "Assign register banks for: " << MF.getName() << '\n'); + const Function &F = MF.getFunction(); + Mode SaveOptMode = OptMode; + if (F.hasOptNone()) + OptMode = Mode::Fast; + init(MF); + + assert(checkFunctionIsLegal(MF)); + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + MachineCycleInfo &CycleInfo = + getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo(); + MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>(); + + MachineUniformityInfo Uniformity = + computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(), + !ST.isSingleLaneExecution(F)); + (void)Uniformity; // TODO: Use this + + assignRegisterBanks(MF); + + OptMode = SaveOptMode; + return false; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.h new file mode 100644 index 000000000000..83e4a6b41da1 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.h @@ -0,0 +1,29 @@ +//===- AMDGPURegBankSelect.h -------------------------------------*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGBANKSELECT_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGBANKSELECT_H + +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" + +namespace llvm { + +class AMDGPURegBankSelect final : public RegBankSelect { +public: + static char ID; + + AMDGPURegBankSelect(Mode RunningMode = Fast); + + StringRef getPassName() const override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +} // namespace llvm +#endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 5e16a405f375..0203af32e389 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -215,6 +215,10 @@ static bool isVectorRegisterBank(const RegisterBank &Bank) { return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID; } +bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const { + return RB != &AMDGPU::SGPRRegBank; +} + unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, const RegisterBank &Src, unsigned Size) const { @@ -846,10 +850,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( assert(std::distance(NewBegin, NewEnd) == OrigRangeSize); for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { - for (MachineOperand &Op : MI.uses()) { - if (!Op.isReg() || Op.isDef()) - continue; - + for (MachineOperand &Op : MI.all_uses()) { Register OldReg = Op.getReg(); if (!SGPROperandRegs.count(OldReg)) continue; @@ -1233,31 +1234,18 @@ bool AMDGPURegisterBankInfo::applyMappingImage( return true; } -static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI, - Register Reg) { - MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); - if (!Def) - return Reg; - - // TODO: 
Guard against this being an implicit def - return Def->getOperand(0).getReg(); -} - // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store // the three offsets (voffset, soffset and instoffset) -static unsigned setBufferOffsets(MachineIRBuilder &B, - const AMDGPURegisterBankInfo &RBI, - Register CombinedOffset, Register &VOffsetReg, - Register &SOffsetReg, int64_t &InstOffsetVal, - Align Alignment) { +unsigned AMDGPURegisterBankInfo::setBufferOffsets( + MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg, + Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const { const LLT S32 = LLT::scalar(32); MachineRegisterInfo *MRI = B.getMRI(); if (std::optional<int64_t> Imm = getIConstantVRegSExtVal(CombinedOffset, *MRI)) { uint32_t SOffset, ImmOffset; - if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget, - Alignment)) { + if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) { VOffsetReg = B.buildConstant(S32, 0).getReg(0); SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); InstOffsetVal = ImmOffset; @@ -1275,9 +1263,9 @@ static unsigned setBufferOffsets(MachineIRBuilder &B, AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset); uint32_t SOffset, ImmOffset; - if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, - &RBI.Subtarget, Alignment)) { - if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) { + if ((int)Offset > 0 && + TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) { + if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) { VOffsetReg = Base; SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); @@ -1298,11 +1286,11 @@ static unsigned setBufferOffsets(MachineIRBuilder &B, // Handle the variable sgpr + vgpr case. 
MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI); if (Add && (int)Offset >= 0) { - Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg()); - Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg()); + Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI); + Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI); - const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI); - const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI); + const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI); + const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI); if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) { VOffsetReg = Src0; @@ -1319,7 +1307,7 @@ static unsigned setBufferOffsets(MachineIRBuilder &B, // Ensure we have a VGPR for the combined offset. This could be an issue if we // have an SGPR offset and a VGPR resource. - if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) { + if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) { VOffsetReg = CombinedOffset; } else { VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0); @@ -1369,8 +1357,8 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( Register VOffset; int64_t ImmOffset = 0; - unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(), - VOffset, SOffset, ImmOffset, Alignment); + unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset, + SOffset, ImmOffset, Alignment); // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we // can, but we need to track an MMO for that. 
@@ -1804,7 +1792,7 @@ getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { std::pair<Register, unsigned> AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const { - const unsigned MaxImm = 4095; + const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(); Register BaseReg; unsigned ImmOffset; const LLT S32 = LLT::scalar(32); @@ -1815,13 +1803,14 @@ AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, unsigned C1 = 0; if (ImmOffset != 0) { - // If the immediate value is too big for the immoffset field, put the value - // and -4096 into the immoffset field so that the value that is copied/added - // for the voffset field is a multiple of 4096, and it stands more chance - // of being CSEd with the copy/add for another similar load/store. - // However, do not do that rounding down to a multiple of 4096 if that is a - // negative number, as it appears to be illegal to have a negative offset - // in the vgpr, even if adding the immediate offset makes it positive. + // If the immediate value is too big for the immoffset field, put only bits + // that would normally fit in the immoffset field. The remaining value that + // is copied/added for the voffset field is a large power of 2, and it + // stands more chance of being CSEd with the copy/add for another similar + // load/store. + // However, do not do that rounding down if that is a negative + // number, as it appears to be illegal to have a negative offset in the + // vgpr, even if adding the immediate offset makes it positive. 
unsigned Overflow = ImmOffset & ~MaxImm; ImmOffset -= Overflow; if ((int32_t)Overflow < 0) { @@ -3016,6 +3005,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl( case Intrinsic::amdgcn_ubfe: applyMappingBFE(OpdMapper, false); return; + case Intrinsic::amdgcn_inverse_ballot: + applyDefaultMapping(OpdMapper); + constrainOpWithReadfirstlane(MI, MRI, 2); // Mask + return; case Intrinsic::amdgcn_ballot: // Use default handling and insert copy to vcc source. break; @@ -3082,14 +3075,16 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(MI, MRI, 2); return; } - case Intrinsic::amdgcn_raw_buffer_load_lds: { + case Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc constrainOpWithReadfirstlane(MI, MRI, 2); // M0 constrainOpWithReadfirstlane(MI, MRI, 5); // soffset return; } - case Intrinsic::amdgcn_struct_buffer_load_lds: { + case Intrinsic::amdgcn_struct_buffer_load_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { applyDefaultMapping(OpdMapper); constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc constrainOpWithReadfirstlane(MI, MRI, 2); // M0 @@ -3745,6 +3740,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_FPEXT: case AMDGPU::G_FEXP2: case AMDGPU::G_FLOG2: + case AMDGPU::G_FLDEXP: case AMDGPU::G_FMINNUM: case AMDGPU::G_FMAXNUM: case AMDGPU::G_FMINNUM_IEEE: @@ -3755,6 +3751,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_STRICT_FSUB: case AMDGPU::G_STRICT_FMUL: case AMDGPU::G_STRICT_FMA: + case AMDGPU::G_STRICT_FLDEXP: case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? 
case AMDGPU::G_FSHR: // TODO: Expand for scalar case AMDGPU::G_AMDGPU_FMIN_LEGACY: @@ -3766,6 +3763,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: case AMDGPU::G_AMDGPU_SMED3: + case AMDGPU::G_AMDGPU_FMED3: return getDefaultMappingVOP(MI); case AMDGPU::G_UMULH: case AMDGPU::G_SMULH: { @@ -4209,6 +4207,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_sin: case Intrinsic::amdgcn_cos: case Intrinsic::amdgcn_log_clamp: + case Intrinsic::amdgcn_log: + case Intrinsic::amdgcn_exp2: case Intrinsic::amdgcn_rcp: case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_sqrt: @@ -4217,7 +4217,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_rsq_clamp: case Intrinsic::amdgcn_fmul_legacy: case Intrinsic::amdgcn_fma_legacy: - case Intrinsic::amdgcn_ldexp: case Intrinsic::amdgcn_frexp_mant: case Intrinsic::amdgcn_frexp_exp: case Intrinsic::amdgcn_fract: @@ -4506,6 +4505,25 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize); break; } + case Intrinsic::amdgcn_inverse_ballot: { + // This must be an SGPR, but accept a VGPR. 
+ Register MaskReg = MI.getOperand(2).getReg(); + unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits(); + unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); + OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize); + break; + } + case Intrinsic::amdgcn_wave_reduce_umin: + case Intrinsic::amdgcn_wave_reduce_umax: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); + unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + auto regBankID = + isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize); + break; + } } break; } @@ -4636,7 +4654,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case Intrinsic::amdgcn_raw_buffer_load: - case Intrinsic::amdgcn_raw_tbuffer_load: { + case Intrinsic::amdgcn_raw_ptr_buffer_load: + case Intrinsic::amdgcn_raw_tbuffer_load: + case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { // FIXME: Should make intrinsic ID the last operand of the instruction, // then this would be the same as store OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); @@ -4645,7 +4665,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); break; } - case Intrinsic::amdgcn_raw_buffer_load_lds: { + case Intrinsic::amdgcn_raw_buffer_load_lds: + case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: { OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); @@ -4653,8 +4674,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case 
Intrinsic::amdgcn_raw_buffer_store: + case Intrinsic::amdgcn_raw_ptr_buffer_store: case Intrinsic::amdgcn_raw_buffer_store_format: - case Intrinsic::amdgcn_raw_tbuffer_store: { + case Intrinsic::amdgcn_raw_ptr_buffer_store_format: + case Intrinsic::amdgcn_raw_tbuffer_store: + case Intrinsic::amdgcn_raw_ptr_tbuffer_store: { OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); @@ -4662,7 +4686,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case Intrinsic::amdgcn_struct_buffer_load: - case Intrinsic::amdgcn_struct_tbuffer_load: { + case Intrinsic::amdgcn_struct_ptr_buffer_load: + case Intrinsic::amdgcn_struct_tbuffer_load: + case Intrinsic::amdgcn_struct_ptr_tbuffer_load: { OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); @@ -4670,7 +4696,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); break; } - case Intrinsic::amdgcn_struct_buffer_load_lds: { + case Intrinsic::amdgcn_struct_buffer_load_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); @@ -4679,7 +4706,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case Intrinsic::amdgcn_struct_buffer_store: - case Intrinsic::amdgcn_struct_tbuffer_store: { + case Intrinsic::amdgcn_struct_ptr_buffer_store: + case Intrinsic::amdgcn_struct_tbuffer_store: + case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { 
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); @@ -4828,9 +4857,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_ATOMICRMW_UMAX: case AMDGPU::G_ATOMICRMW_UMIN: case AMDGPU::G_ATOMICRMW_FADD: + case AMDGPU::G_ATOMICRMW_UINC_WRAP: + case AMDGPU::G_ATOMICRMW_UDEC_WRAP: case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: - case AMDGPU::G_AMDGPU_ATOMIC_INC: - case AMDGPU::G_AMDGPU_ATOMIC_DEC: case AMDGPU::G_AMDGPU_ATOMIC_FMIN: case AMDGPU::G_AMDGPU_ATOMIC_FMAX: { OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index c9741c2202e6..78214d7a1058 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -82,6 +82,9 @@ public: applyMappingImage(MachineInstr &MI, const OperandsMapper &OpdMapper, MachineRegisterInfo &MRI, int RSrcIdx) const; + unsigned setBufferOffsets(MachineIRBuilder &B, Register CombinedOffset, + Register &VOffsetReg, Register &SOffsetReg, + int64_t &InstOffsetVal, Align Alignment) const; bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const; bool applyMappingBFE(const OperandsMapper &OpdMapper, bool Signed) const; @@ -165,6 +168,8 @@ public: public: AMDGPURegisterBankInfo(const GCNSubtarget &STI); + bool isDivergentRegBank(const RegisterBank *RB) const override; + unsigned copyCost(const RegisterBank &A, const RegisterBank &B, unsigned Size) const override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp deleted file mode 100644 index b7521540c020..000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp +++ /dev/null @@ -1,156 +0,0 @@ -//===- AMDGPUReleaseVGPRs.cpp - Automatically 
release vgprs on GFX11+ -----===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Insert S_SENDMSG instructions to release vgprs on GFX11+. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "GCNSubtarget.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIDefines.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineOperand.h" -#include <optional> -using namespace llvm; - -#define DEBUG_TYPE "release-vgprs" - -namespace { - -class AMDGPUReleaseVGPRs : public MachineFunctionPass { -public: - static char ID; - - AMDGPUReleaseVGPRs() : MachineFunctionPass(ID) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - // Track if the last instruction referencing a vgpr in a MBB is a VMEM - // store. Because this pass is late in the pipeline, it is expected that the - // last vgpr use will likely be one of vmem store, ds, exp. - // Loads and others vgpr operations would have been - // deleted by this point, except for complex control flow involving loops. - // This is why we are just testing the type of instructions rather - // than the operands. - class LastVGPRUseIsVMEMStore { - BitVector BlockVMEMStore; - - static std::optional<bool> - lastVGPRUseIsStore(const MachineBasicBlock &MBB) { - for (auto &MI : reverse(MBB.instrs())) { - // If it's a VMEM store, a VGPR will be used, return true. - if ((SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI)) && - MI.mayStore()) - return true; - - // If it's referencing a VGPR but is not a VMEM store, return false. 
- if (SIInstrInfo::isDS(MI) || SIInstrInfo::isEXP(MI) || - SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI) || - SIInstrInfo::isVALU(MI)) - return false; - } - // Wait until the values are propagated from the predecessors - return std::nullopt; - } - - public: - LastVGPRUseIsVMEMStore(const MachineFunction &MF) - : BlockVMEMStore(MF.getNumBlockIDs()) { - - df_iterator_default_set<const MachineBasicBlock *> Visited; - SmallVector<const MachineBasicBlock *> EndWithVMEMStoreBlocks; - - for (const auto &MBB : MF) { - auto LastUseIsStore = lastVGPRUseIsStore(MBB); - if (!LastUseIsStore.has_value()) - continue; - - if (*LastUseIsStore) { - EndWithVMEMStoreBlocks.push_back(&MBB); - } else { - Visited.insert(&MBB); - } - } - - for (const auto *MBB : EndWithVMEMStoreBlocks) { - for (const auto *Succ : depth_first_ext(MBB, Visited)) { - BlockVMEMStore[Succ->getNumber()] = true; - } - } - } - - // Return true if the last instruction referencing a vgpr in this MBB - // is a VMEM store, otherwise return false. - bool isLastVGPRUseVMEMStore(const MachineBasicBlock &MBB) const { - return BlockVMEMStore[MBB.getNumber()]; - } - }; - - static bool - runOnMachineBasicBlock(MachineBasicBlock &MBB, const SIInstrInfo *SII, - const LastVGPRUseIsVMEMStore &BlockVMEMStore) { - - bool Changed = false; - - for (auto &MI : MBB.terminators()) { - // Look for S_ENDPGM instructions - if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { - // If the last instruction using a VGPR in the block is a VMEM store, - // release VGPRs. 
The VGPRs release will be placed just before ending - // the program - if (BlockVMEMStore.isLastVGPRUseVMEMStore(MBB)) { - BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_SENDMSG)) - .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus); - Changed = true; - } - } - } - - return Changed; - } - - bool runOnMachineFunction(MachineFunction &MF) override { - Function &F = MF.getFunction(); - if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv())) - return false; - - // This pass only runs on GFX11+ - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - if (ST.getGeneration() < AMDGPUSubtarget::GFX11) - return false; - - LLVM_DEBUG(dbgs() << "AMDGPUReleaseVGPRs running on " << MF.getName() - << "\n"); - - const SIInstrInfo *SII = ST.getInstrInfo(); - LastVGPRUseIsVMEMStore BlockVMEMStore(MF); - - bool Changed = false; - for (auto &MBB : MF) { - Changed |= runOnMachineBasicBlock(MBB, SII, BlockVMEMStore); - } - - return Changed; - } -}; - -} // namespace - -char AMDGPUReleaseVGPRs::ID = 0; - -char &llvm::AMDGPUReleaseVGPRsID = AMDGPUReleaseVGPRs::ID; - -INITIALIZE_PASS(AMDGPUReleaseVGPRs, DEBUG_TYPE, "Release VGPRs", false, false) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp new file mode 100644 index 000000000000..580352fb8cf4 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp @@ -0,0 +1,186 @@ +//===-- AMDGPURemoveIncompatibleFunctions.cpp -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass replaces all uses of functions that use GPU features +/// incompatible with the current GPU with null then deletes the function. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "amdgpu-remove-incompatible-functions" + +using namespace llvm; + +namespace llvm { +extern const SubtargetFeatureKV + AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures - 1]; +} + +namespace { + +using Generation = AMDGPUSubtarget::Generation; + +class AMDGPURemoveIncompatibleFunctions : public ModulePass { +public: + static char ID; + + AMDGPURemoveIncompatibleFunctions(const TargetMachine *TM = nullptr) + : ModulePass(ID), TM(TM) { + assert(TM && "No TargetMachine!"); + } + + StringRef getPassName() const override { + return "AMDGPU Remove Incompatible Functions"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override {} + + /// Checks a single function, returns true if the function must be deleted. 
+ bool checkFunction(Function &F); + + bool runOnModule(Module &M) override { + assert(TM->getTargetTriple().isAMDGCN()); + + SmallVector<Function *, 4> FnsToDelete; + for (Function &F : M) { + if (checkFunction(F)) + FnsToDelete.push_back(&F); + } + + for (Function *F : FnsToDelete) { + F->replaceAllUsesWith(ConstantPointerNull::get(F->getType())); + F->eraseFromParent(); + } + return !FnsToDelete.empty(); + } + +private: + const TargetMachine *TM = nullptr; +}; + +StringRef getFeatureName(unsigned Feature) { + for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV) + if (Feature == KV.Value) + return KV.Key; + + llvm_unreachable("Unknown Target feature"); +} + +const SubtargetSubTypeKV *getGPUInfo(const GCNSubtarget &ST, + StringRef GPUName) { + for (const SubtargetSubTypeKV &KV : ST.getAllProcessorDescriptions()) + if (StringRef(KV.Key) == GPUName) + return &KV; + + return nullptr; +} + +constexpr unsigned FeaturesToCheck[] = { + AMDGPU::FeatureGFX11Insts, AMDGPU::FeatureGFX10Insts, + AMDGPU::FeatureGFX9Insts, AMDGPU::FeatureGFX8Insts, + AMDGPU::FeatureDPP, AMDGPU::Feature16BitInsts, + AMDGPU::FeatureDot1Insts, AMDGPU::FeatureDot2Insts, + AMDGPU::FeatureDot3Insts, AMDGPU::FeatureDot4Insts, + AMDGPU::FeatureDot5Insts, AMDGPU::FeatureDot6Insts, + AMDGPU::FeatureDot7Insts, AMDGPU::FeatureDot8Insts, +}; + +FeatureBitset expandImpliedFeatures(const FeatureBitset &Features) { + FeatureBitset Result = Features; + for (const SubtargetFeatureKV &FE : AMDGPUFeatureKV) { + if (Features.test(FE.Value) && FE.Implies.any()) + Result |= expandImpliedFeatures(FE.Implies.getAsBitset()); + } + return Result; +} + +void reportFunctionRemoved(Function &F, unsigned Feature) { + OptimizationRemarkEmitter ORE(&F); + ORE.emit([&]() { + // Note: we print the function name as part of the diagnostic because if + // debug info is not present, users get "<unknown>:0:0" as the debug + // loc. If we didn't print the function name there would be no way to + // tell which function got removed. 
+ return OptimizationRemark(DEBUG_TYPE, "AMDGPUIncompatibleFnRemoved", &F) + << "removing function '" << F.getName() << "': +" + << getFeatureName(Feature) + << " is not supported on the current target"; + }); + return; +} +} // end anonymous namespace + +bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) { + if (F.isDeclaration()) + return false; + + const GCNSubtarget *ST = + static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F)); + + // Check the GPU isn't generic. Generic is used for testing only + // and we don't want this pass to interfere with it. + StringRef GPUName = ST->getCPU(); + if (GPUName.empty() || GPUName.contains("generic")) + return false; + + // Try to fetch the GPU's info. If we can't, it's likely an unknown processor + // so just bail out. + const SubtargetSubTypeKV *GPUInfo = getGPUInfo(*ST, GPUName); + if (!GPUInfo) + return false; + + // Get all the features implied by the current GPU, and recursively expand + // the features that imply other features. + // + // e.g. GFX90A implies FeatureGFX9, and FeatureGFX9 implies a whole set of + // other features. + const FeatureBitset GPUFeatureBits = + expandImpliedFeatures(GPUInfo->Implies.getAsBitset()); + + // Now that the have a FeatureBitset containing all possible features for + // the chosen GPU, check our list of "suspicious" features. + + // Check that the user didn't enable any features that aren't part of that + // GPU's feature set. We only check a predetermined set of features. + for (unsigned Feature : FeaturesToCheck) { + if (ST->hasFeature(Feature) && !GPUFeatureBits.test(Feature)) { + reportFunctionRemoved(F, Feature); + return true; + } + } + + // Delete FeatureWavefrontSize32 functions for + // gfx9 and below targets that don't support the mode. + // gfx10+ is implied to support both wave32 and 64 features. + // They are not in the feature set. 
So, we need a separate check + if (ST->getGeneration() < AMDGPUSubtarget::GFX10 && + ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) { + reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize32); + return true; + } + return false; +} + +INITIALIZE_PASS(AMDGPURemoveIncompatibleFunctions, DEBUG_TYPE, + "AMDGPU Remove Incompatible Functions", false, false) + +char AMDGPURemoveIncompatibleFunctions::ID = 0; + +ModulePass * +llvm::createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *TM) { + return new AMDGPURemoveIncompatibleFunctions(TM); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp deleted file mode 100644 index 299ac106ebee..000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp +++ /dev/null @@ -1,648 +0,0 @@ -//===-- AMDGPUReplaceLDSUseWithPointer.cpp --------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass replaces all the uses of LDS within non-kernel functions by -// corresponding pointer counter-parts. -// -// The main motivation behind this pass is - to *avoid* subsequent LDS lowering -// pass from directly packing LDS (assume large LDS) into a struct type which -// would otherwise cause allocating huge memory for struct instance within every -// kernel. -// -// Brief sketch of the algorithm implemented in this pass is as below: -// -// 1. Collect all the LDS defined in the module which qualify for pointer -// replacement, say it is, LDSGlobals set. -// -// 2. Collect all the reachable callees for each kernel defined in the module, -// say it is, KernelToCallees map. -// -// 3. 
FOR (each global GV from LDSGlobals set) DO -// LDSUsedNonKernels = Collect all non-kernel functions which use GV. -// FOR (each kernel K in KernelToCallees map) DO -// ReachableCallees = KernelToCallees[K] -// ReachableAndLDSUsedCallees = -// SetIntersect(LDSUsedNonKernels, ReachableCallees) -// IF (ReachableAndLDSUsedCallees is not empty) THEN -// Pointer = Create a pointer to point-to GV if not created. -// Initialize Pointer to point-to GV within kernel K. -// ENDIF -// ENDFOR -// Replace all uses of GV within non kernel functions by Pointer. -// ENFOR -// -// LLVM IR example: -// -// Input IR: -// -// @lds = internal addrspace(3) global [4 x i32] undef, align 16 -// -// define internal void @f0() { -// entry: -// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds, -// i32 0, i32 0 -// ret void -// } -// -// define protected amdgpu_kernel void @k0() { -// entry: -// call void @f0() -// ret void -// } -// -// Output IR: -// -// @lds = internal addrspace(3) global [4 x i32] undef, align 16 -// @lds.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2 -// -// define internal void @f0() { -// entry: -// %0 = load i16, i16 addrspace(3)* @lds.ptr, align 2 -// %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0 -// %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)* -// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2, -// i32 0, i32 0 -// ret void -// } -// -// define protected amdgpu_kernel void @k0() { -// entry: -// store i16 ptrtoint ([4 x i32] addrspace(3)* @lds to i16), -// i16 addrspace(3)* @lds.ptr, align 2 -// call void @f0() -// ret void -// } -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "GCNSubtarget.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "Utils/AMDGPUMemoryUtils.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetOperations.h" -#include "llvm/Analysis/CallGraph.h" -#include 
"llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicsAMDGPU.h" -#include "llvm/IR/ReplaceConstant.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/ModuleUtils.h" -#include <algorithm> -#include <vector> - -#define DEBUG_TYPE "amdgpu-replace-lds-use-with-pointer" - -using namespace llvm; - -namespace { - -namespace AMDGPU { -/// Collect all the instructions where user \p U belongs to. \p U could be -/// instruction itself or it could be a constant expression which is used within -/// an instruction. If \p CollectKernelInsts is true, collect instructions only -/// from kernels, otherwise collect instructions only from non-kernel functions. -DenseMap<Function *, SmallPtrSet<Instruction *, 8>> -getFunctionToInstsMap(User *U, bool CollectKernelInsts); - -SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV); - -} // namespace AMDGPU - -class ReplaceLDSUseImpl { - Module &M; - LLVMContext &Ctx; - const DataLayout &DL; - Constant *LDSMemBaseAddr; - - DenseMap<GlobalVariable *, GlobalVariable *> LDSToPointer; - DenseMap<GlobalVariable *, SmallPtrSet<Function *, 8>> LDSToNonKernels; - DenseMap<Function *, SmallPtrSet<Function *, 8>> KernelToCallees; - DenseMap<Function *, SmallPtrSet<GlobalVariable *, 8>> KernelToLDSPointers; - DenseMap<Function *, BasicBlock *> KernelToInitBB; - DenseMap<Function *, DenseMap<GlobalVariable *, Value *>> - FunctionToLDSToReplaceInst; - - // Collect LDS which requires their uses to be replaced by pointer. - std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() { - // Collect LDS which requires module lowering. 
- std::vector<GlobalVariable *> LDSGlobals = - llvm::AMDGPU::findLDSVariablesToLower(M, nullptr); - - // Remove LDS which don't qualify for replacement. - llvm::erase_if(LDSGlobals, [&](GlobalVariable *GV) { - return shouldIgnorePointerReplacement(GV); - }); - - return LDSGlobals; - } - - // Returns true if uses of given LDS global within non-kernel functions should - // be keep as it is without pointer replacement. - bool shouldIgnorePointerReplacement(GlobalVariable *GV) { - // LDS whose size is very small and doesn't exceed pointer size is not worth - // replacing. - if (DL.getTypeAllocSize(GV->getValueType()) <= 2) - return true; - - // LDS which is not used from non-kernel function scope or it is used from - // global scope does not qualify for replacement. - LDSToNonKernels[GV] = AMDGPU::collectNonKernelAccessorsOfLDS(GV); - return LDSToNonKernels[GV].empty(); - - // FIXME: When GV is used within all (or within most of the kernels), then - // it does not make sense to create a pointer for it. - } - - // Insert new global LDS pointer which points to LDS. - GlobalVariable *createLDSPointer(GlobalVariable *GV) { - // LDS pointer which points to LDS is already created? Return it. - auto PointerEntry = LDSToPointer.insert(std::pair(GV, nullptr)); - if (!PointerEntry.second) - return PointerEntry.first->second; - - // We need to create new LDS pointer which points to LDS. - // - // Each CU owns at max 64K of LDS memory, so LDS address ranges from 0 to - // 2^16 - 1. Hence 16 bit pointer is enough to hold the LDS address. 
- auto *I16Ty = Type::getInt16Ty(Ctx); - GlobalVariable *LDSPointer = new GlobalVariable( - M, I16Ty, false, GlobalValue::InternalLinkage, UndefValue::get(I16Ty), - GV->getName() + Twine(".ptr"), nullptr, GlobalVariable::NotThreadLocal, - AMDGPUAS::LOCAL_ADDRESS); - - LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - LDSPointer->setAlignment(llvm::AMDGPU::getAlign(DL, LDSPointer)); - - // Mark that an associated LDS pointer is created for LDS. - LDSToPointer[GV] = LDSPointer; - - return LDSPointer; - } - - // Split entry basic block in such a way that only lane 0 of each wave does - // the LDS pointer initialization, and return newly created basic block. - BasicBlock *activateLaneZero(Function *K) { - // If the entry basic block of kernel K is already split, then return - // newly created basic block. - auto BasicBlockEntry = KernelToInitBB.insert(std::pair(K, nullptr)); - if (!BasicBlockEntry.second) - return BasicBlockEntry.first->second; - - // Split entry basic block of kernel K. - auto *EI = &(*(K->getEntryBlock().getFirstInsertionPt())); - IRBuilder<> Builder(EI); - - Value *Mbcnt = - Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, - {Builder.getInt32(-1), Builder.getInt32(0)}); - Value *Cond = Builder.CreateICmpEQ(Mbcnt, Builder.getInt32(0)); - Instruction *WB = cast<Instruction>( - Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {})); - - BasicBlock *NBB = SplitBlockAndInsertIfThen(Cond, WB, false)->getParent(); - - // Mark that the entry basic block of kernel K is split. - KernelToInitBB[K] = NBB; - - return NBB; - } - - // Within given kernel, initialize given LDS pointer to point to given LDS. - void initializeLDSPointer(Function *K, GlobalVariable *GV, - GlobalVariable *LDSPointer) { - // If LDS pointer is already initialized within K, then nothing to do. 
- auto PointerEntry = KernelToLDSPointers.insert( - std::pair(K, SmallPtrSet<GlobalVariable *, 8>())); - if (!PointerEntry.second) - if (PointerEntry.first->second.contains(LDSPointer)) - return; - - // Insert instructions at EI which initialize LDS pointer to point-to LDS - // within kernel K. - // - // That is, convert pointer type of GV to i16, and then store this converted - // i16 value within LDSPointer which is of type i16*. - auto *EI = &(*(activateLaneZero(K)->getFirstInsertionPt())); - IRBuilder<> Builder(EI); - Builder.CreateStore(Builder.CreatePtrToInt(GV, Type::getInt16Ty(Ctx)), - LDSPointer); - - // Mark that LDS pointer is initialized within kernel K. - KernelToLDSPointers[K].insert(LDSPointer); - } - - // We have created an LDS pointer for LDS, and initialized it to point-to LDS - // within all relevant kernels. Now replace all the uses of LDS within - // non-kernel functions by LDS pointer. - void replaceLDSUseByPointer(GlobalVariable *GV, GlobalVariable *LDSPointer) { - SmallVector<User *, 8> LDSUsers(GV->users()); - for (auto *U : LDSUsers) { - // When `U` is a constant expression, it is possible that same constant - // expression exists within multiple instructions, and within multiple - // non-kernel functions. Collect all those non-kernel functions and all - // those instructions within which `U` exist. - auto FunctionToInsts = - AMDGPU::getFunctionToInstsMap(U, false /*=CollectKernelInsts*/); - - for (const auto &FunctionToInst : FunctionToInsts) { - Function *F = FunctionToInst.first; - auto &Insts = FunctionToInst.second; - for (auto *I : Insts) { - // If `U` is a constant expression, then we need to break the - // associated instruction into a set of separate instructions by - // converting constant expressions into instructions. - SmallPtrSet<Instruction *, 8> UserInsts; - - if (U == I) { - // `U` is an instruction, conversion from constant expression to - // set of instructions is *not* required. 
- UserInsts.insert(I); - } else { - // `U` is a constant expression, convert it into corresponding set - // of instructions. - auto *CE = cast<ConstantExpr>(U); - convertConstantExprsToInstructions(I, CE, &UserInsts); - } - - // Go through all the user instructions, if LDS exist within them as - // an operand, then replace it by replace instruction. - for (auto *II : UserInsts) { - auto *ReplaceInst = getReplacementInst(F, GV, LDSPointer); - II->replaceUsesOfWith(GV, ReplaceInst); - } - } - } - } - } - - // Create a set of replacement instructions which together replace LDS within - // non-kernel function F by accessing LDS indirectly using LDS pointer. - Value *getReplacementInst(Function *F, GlobalVariable *GV, - GlobalVariable *LDSPointer) { - // If the instruction which replaces LDS within F is already created, then - // return it. - auto LDSEntry = FunctionToLDSToReplaceInst.insert( - std::pair(F, DenseMap<GlobalVariable *, Value *>())); - if (!LDSEntry.second) { - auto ReplaceInstEntry = - LDSEntry.first->second.insert(std::pair(GV, nullptr)); - if (!ReplaceInstEntry.second) - return ReplaceInstEntry.first->second; - } - - // Get the instruction insertion point within the beginning of the entry - // block of current non-kernel function. - auto *EI = &(*(F->getEntryBlock().getFirstInsertionPt())); - IRBuilder<> Builder(EI); - - // Insert required set of instructions which replace LDS within F. - auto *V = Builder.CreateBitCast( - Builder.CreateGEP( - Builder.getInt8Ty(), LDSMemBaseAddr, - Builder.CreateLoad(LDSPointer->getValueType(), LDSPointer)), - GV->getType()); - - // Mark that the replacement instruction which replace LDS within F is - // created. 
- FunctionToLDSToReplaceInst[F][GV] = V; - - return V; - } - -public: - ReplaceLDSUseImpl(Module &M) - : M(M), Ctx(M.getContext()), DL(M.getDataLayout()) { - LDSMemBaseAddr = Constant::getIntegerValue( - PointerType::get(Type::getInt8Ty(M.getContext()), - AMDGPUAS::LOCAL_ADDRESS), - APInt(32, 0)); - } - - // Entry-point function which interface ReplaceLDSUseImpl with outside of the - // class. - bool replaceLDSUse(); - -private: - // For a given LDS from collected LDS globals set, replace its non-kernel - // function scope uses by pointer. - bool replaceLDSUse(GlobalVariable *GV); -}; - -// For given LDS from collected LDS globals set, replace its non-kernel function -// scope uses by pointer. -bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) { - // Holds all those non-kernel functions within which LDS is being accessed. - SmallPtrSet<Function *, 8> &LDSAccessors = LDSToNonKernels[GV]; - - // The LDS pointer which points to LDS and replaces all the uses of LDS. - GlobalVariable *LDSPointer = nullptr; - - // Traverse through each kernel K, check and if required, initialize the - // LDS pointer to point to LDS within K. - for (const auto &KernelToCallee : KernelToCallees) { - Function *K = KernelToCallee.first; - SmallPtrSet<Function *, 8> Callees = KernelToCallee.second; - - // Compute reachable and LDS used callees for kernel K. - set_intersect(Callees, LDSAccessors); - - // None of the LDS accessing non-kernel functions are reachable from - // kernel K. Hence, no need to initialize LDS pointer within kernel K. - if (Callees.empty()) - continue; - - // We have found reachable and LDS used callees for kernel K, and we need to - // initialize LDS pointer within kernel K, and we need to replace LDS use - // within those callees by LDS pointer. - // - // But, first check if LDS pointer is already created, if not create one. - LDSPointer = createLDSPointer(GV); - - // Initialize LDS pointer to point to LDS within kernel K. 
- initializeLDSPointer(K, GV, LDSPointer); - } - - // We have not found reachable and LDS used callees for any of the kernels, - // and hence we have not created LDS pointer. - if (!LDSPointer) - return false; - - // We have created an LDS pointer for LDS, and initialized it to point-to LDS - // within all relevant kernels. Now replace all the uses of LDS within - // non-kernel functions by LDS pointer. - replaceLDSUseByPointer(GV, LDSPointer); - - return true; -} - -namespace AMDGPU { - -// An helper class for collecting all reachable callees for each kernel defined -// within the module. -class CollectReachableCallees { - Module &M; - CallGraph CG; - SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions; - - // Collect all address taken functions within the module. - void collectAddressTakenFunctions() { - auto *ECNode = CG.getExternalCallingNode(); - - for (const auto &GI : *ECNode) { - auto *CGN = GI.second; - auto *F = CGN->getFunction(); - if (!F || F->isDeclaration() || llvm::AMDGPU::isKernelCC(F)) - continue; - AddressTakenFunctions.insert(CGN); - } - } - - // For given kernel, collect all its reachable non-kernel functions. - SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) { - SmallPtrSet<Function *, 8> ReachableCallees; - - // Call graph node which represents this kernel. - auto *KCGN = CG[K]; - - // Go through all call graph nodes reachable from the node representing this - // kernel, visit all their call sites, if the call site is direct, add - // corresponding callee to reachable callee set, if it is indirect, resolve - // the indirect call site to potential reachable callees, add them to - // reachable callee set, and repeat the process for the newly added - // potential callee nodes. - // - // FIXME: Need to handle bit-casted function pointers. 
- // - SmallVector<CallGraphNode *, 8> CGNStack(depth_first(KCGN)); - SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes; - while (!CGNStack.empty()) { - auto *CGN = CGNStack.pop_back_val(); - - if (!VisitedCGNodes.insert(CGN).second) - continue; - - // Ignore call graph node which does not have associated function or - // associated function is not a definition. - if (!CGN->getFunction() || CGN->getFunction()->isDeclaration()) - continue; - - for (const auto &GI : *CGN) { - auto *RCB = cast<CallBase>(*GI.first); - auto *RCGN = GI.second; - - if (auto *DCallee = RCGN->getFunction()) { - ReachableCallees.insert(DCallee); - } else if (RCB->isIndirectCall()) { - auto *RCBFTy = RCB->getFunctionType(); - for (auto *ACGN : AddressTakenFunctions) { - auto *ACallee = ACGN->getFunction(); - if (ACallee->getFunctionType() == RCBFTy) { - ReachableCallees.insert(ACallee); - CGNStack.append(df_begin(ACGN), df_end(ACGN)); - } - } - } - } - } - - return ReachableCallees; - } - -public: - explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) { - // Collect address taken functions. - collectAddressTakenFunctions(); - } - - void collectReachableCallees( - DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { - // Collect reachable callee set for each kernel defined in the module. - for (Function &F : M.functions()) { - if (!llvm::AMDGPU::isKernelCC(&F)) - continue; - Function *K = &F; - KernelToCallees[K] = collectReachableCallees(K); - } - } -}; - -/// Collect reachable callees for each kernel defined in the module \p M and -/// return collected callees at \p KernelToCallees. 
-void collectReachableCallees( - Module &M, - DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { - CollectReachableCallees CRC{M}; - CRC.collectReachableCallees(KernelToCallees); -} - -/// For the given LDS global \p GV, visit all its users and collect all -/// non-kernel functions within which \p GV is used and return collected list of -/// such non-kernel functions. -SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) { - SmallPtrSet<Function *, 8> LDSAccessors; - SmallVector<User *, 8> UserStack(GV->users()); - SmallPtrSet<User *, 8> VisitedUsers; - - while (!UserStack.empty()) { - auto *U = UserStack.pop_back_val(); - - // `U` is already visited? continue to next one. - if (!VisitedUsers.insert(U).second) - continue; - - // `U` is a global variable which is initialized with LDS. Ignore LDS. - if (isa<GlobalValue>(U)) - return SmallPtrSet<Function *, 8>(); - - // Recursively explore constant users. - if (isa<Constant>(U)) { - append_range(UserStack, U->users()); - continue; - } - - // `U` should be an instruction, if it belongs to a non-kernel function F, - // then collect F. 
- Function *F = cast<Instruction>(U)->getFunction(); - if (!llvm::AMDGPU::isKernelCC(F)) - LDSAccessors.insert(F); - } - - return LDSAccessors; -} - -DenseMap<Function *, SmallPtrSet<Instruction *, 8>> -getFunctionToInstsMap(User *U, bool CollectKernelInsts) { - DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts; - SmallVector<User *, 8> UserStack; - SmallPtrSet<User *, 8> VisitedUsers; - - UserStack.push_back(U); - - while (!UserStack.empty()) { - auto *UU = UserStack.pop_back_val(); - - if (!VisitedUsers.insert(UU).second) - continue; - - if (isa<GlobalValue>(UU)) - continue; - - if (isa<Constant>(UU)) { - append_range(UserStack, UU->users()); - continue; - } - - auto *I = cast<Instruction>(UU); - Function *F = I->getFunction(); - if (CollectKernelInsts) { - if (!llvm::AMDGPU::isKernelCC(F)) { - continue; - } - } else { - if (llvm::AMDGPU::isKernelCC(F)) { - continue; - } - } - - FunctionToInsts.insert(std::pair(F, SmallPtrSet<Instruction *, 8>())); - FunctionToInsts[F].insert(I); - } - - return FunctionToInsts; -} - -} // namespace AMDGPU - -// Entry-point function which interface ReplaceLDSUseImpl with outside of the -// class. -bool ReplaceLDSUseImpl::replaceLDSUse() { - // Collect LDS which requires their uses to be replaced by pointer. - std::vector<GlobalVariable *> LDSGlobals = - collectLDSRequiringPointerReplace(); - - // No LDS to pointer-replace. Nothing to do. - if (LDSGlobals.empty()) - return false; - - // Collect reachable callee set for each kernel defined in the module. - AMDGPU::collectReachableCallees(M, KernelToCallees); - - if (KernelToCallees.empty()) { - // Either module does not have any kernel definitions, or none of the kernel - // has a call to non-kernel functions, or we could not resolve any of the - // call sites to proper non-kernel functions, because of the situations like - // inline asm calls. Nothing to replace. 
- return false; - } - - // For every LDS from collected LDS globals set, replace its non-kernel - // function scope use by pointer. - bool Changed = false; - for (auto *GV : LDSGlobals) - Changed |= replaceLDSUse(GV); - - return Changed; -} - -class AMDGPUReplaceLDSUseWithPointer : public ModulePass { -public: - static char ID; - - AMDGPUReplaceLDSUseWithPointer() : ModulePass(ID) { - initializeAMDGPUReplaceLDSUseWithPointerPass( - *PassRegistry::getPassRegistry()); - } - - bool runOnModule(Module &M) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<TargetPassConfig>(); - } -}; - -} // namespace - -char AMDGPUReplaceLDSUseWithPointer::ID = 0; -char &llvm::AMDGPUReplaceLDSUseWithPointerID = - AMDGPUReplaceLDSUseWithPointer::ID; - -INITIALIZE_PASS_BEGIN( - AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE, - "Replace within non-kernel function use of LDS with pointer", - false /*only look at the cfg*/, false /*analysis pass*/) -INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_END( - AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE, - "Replace within non-kernel function use of LDS with pointer", - false /*only look at the cfg*/, false /*analysis pass*/) - -bool AMDGPUReplaceLDSUseWithPointer::runOnModule(Module &M) { - ReplaceLDSUseImpl LDSUseReplacer{M}; - return LDSUseReplacer.replaceLDSUse(); -} - -ModulePass *llvm::createAMDGPUReplaceLDSUseWithPointerPass() { - return new AMDGPUReplaceLDSUseWithPointer(); -} - -PreservedAnalyses -AMDGPUReplaceLDSUseWithPointerPass::run(Module &M, ModuleAnalysisManager &AM) { - ReplaceLDSUseImpl LDSUseReplacer{M}; - LDSUseReplacer.replaceLDSUse(); - return PreservedAnalyses::all(); -} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 31e134d42e23..804bf503e4f9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -104,6 +104,7 @@ bool 
AMDGPUResourceUsageAnalysis::runOnModule(Module &M) { MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); const TargetMachine &TM = TPC->getTM<TargetMachine>(); + const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo(); bool HasIndirectCall = false; CallGraph CG = CallGraph(M); @@ -111,7 +112,8 @@ bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) { // By default, for code object v5 and later, track only the minimum scratch // size - if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) { + if (AMDGPU::getCodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 || + STI.getTargetTriple().getOS() == Triple::AMDPAL) { if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences()) AssumedStackSizeForDynamicSizeObjects = 0; if (!AssumedStackSizeForExternalCall.getNumOccurrences()) @@ -338,11 +340,9 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( break; } - if (AMDGPU::SReg_32RegClass.contains(Reg) || - AMDGPU::SReg_LO16RegClass.contains(Reg) || + if (AMDGPU::SGPR_32RegClass.contains(Reg) || + AMDGPU::SGPR_LO16RegClass.contains(Reg) || AMDGPU::SGPR_HI16RegClass.contains(Reg)) { - assert(!AMDGPU::TTMP_32RegClass.contains(Reg) && - "trap handler registers should not be used"); IsSGPR = true; Width = 1; } else if (AMDGPU::VGPR_32RegClass.contains(Reg) || @@ -355,9 +355,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( IsSGPR = false; IsAGPR = true; Width = 1; - } else if (AMDGPU::SReg_64RegClass.contains(Reg)) { - assert(!AMDGPU::TTMP_64RegClass.contains(Reg) && - "trap handler registers should not be used"); + } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) { IsSGPR = true; Width = 2; } else if (AMDGPU::VReg_64RegClass.contains(Reg)) { @@ -377,9 +375,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( IsSGPR = false; IsAGPR = true; Width = 3; - } else if (AMDGPU::SReg_128RegClass.contains(Reg)) { - assert(!AMDGPU::TTMP_128RegClass.contains(Reg) && - "trap handler registers should not be used"); + } else if 
(AMDGPU::SGPR_128RegClass.contains(Reg)) { IsSGPR = true; Width = 4; } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { @@ -420,8 +416,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( IsAGPR = true; Width = 7; } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { - assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && - "trap handler registers should not be used"); IsSGPR = true; Width = 8; } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { @@ -472,8 +466,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( IsAGPR = true; Width = 12; } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { - assert(!AMDGPU::TTMP_512RegClass.contains(Reg) && - "trap handler registers should not be used"); IsSGPR = true; Width = 16; } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { @@ -494,7 +486,15 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( IsAGPR = true; Width = 32; } else { - llvm_unreachable("Unknown register class"); + // We only expect TTMP registers or registers that do not belong to + // any RC. 
+ assert((AMDGPU::TTMP_32RegClass.contains(Reg) || + AMDGPU::TTMP_64RegClass.contains(Reg) || + AMDGPU::TTMP_128RegClass.contains(Reg) || + AMDGPU::TTMP_256RegClass.contains(Reg) || + AMDGPU::TTMP_512RegClass.contains(Reg) || + !TRI.getPhysRegBaseClass(Reg)) && + "Unknown register class"); } unsigned HWReg = TRI.getHWRegIndex(Reg); int MaxUsed = HWReg + Width - 1; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp index 3ff3546f4f92..2fde7afc0c14 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -46,6 +46,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/IR/AttributeMask.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/InitializePasses.h" @@ -377,19 +378,12 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { if (!OutArgIndexes.count(Arg.getArgNo())) continue; - PointerType *ArgType = cast<PointerType>(Arg.getType()); - Type *EltTy = OutArgIndexes[Arg.getArgNo()]; const auto Align = DL->getValueOrABITypeAlignment(Arg.getParamAlign(), EltTy); Value *Val = B.CreateExtractValue(StubCall, RetIdx++); - Type *PtrTy = Val->getType()->getPointerTo(ArgType->getAddressSpace()); - - // We can peek through bitcasts, so the type may not match. 
- Value *PtrVal = B.CreateBitCast(&Arg, PtrTy); - - B.CreateAlignedStore(Val, PtrVal, Align); + B.CreateAlignedStore(Val, &Arg, Align); } if (!RetTy->isVoidTy()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp index ff34726fdf02..9c07851243c9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp @@ -10,7 +10,7 @@ // StructurizedCFG pass, and this pass has some additional limitation that make // it can only run after SIAnnotateControlFlow. // -// To achieve optimal code generation for AMDGPU, we assume that divergence +// To achieve optimal code generation for AMDGPU, we assume that uniformity // analysis reports the PHI in join block of divergent branch as uniform if // it has one unique uniform value plus additional undefined/poisoned incoming // value. That is to say the later compiler pipeline will ensure such PHI always @@ -56,7 +56,7 @@ // \--- #include "AMDGPU.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" +#include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" @@ -81,11 +81,11 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LegacyDivergenceAnalysis>(); + AU.addRequired<UniformityInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<LegacyDivergenceAnalysis>(); + AU.addPreserved<UniformityInfoWrapperPass>(); AU.setPreservesCFG(); } }; @@ -95,17 +95,17 @@ char AMDGPURewriteUndefForPHI::ID = 0; INITIALIZE_PASS_BEGIN(AMDGPURewriteUndefForPHI, DEBUG_TYPE, "Rewrite undef for PHI", false, false) -INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(AMDGPURewriteUndefForPHI, DEBUG_TYPE, "Rewrite 
undef for PHI", false, false) -bool rewritePHIs(Function &F, LegacyDivergenceAnalysis *DA, DominatorTree *DT) { +bool rewritePHIs(Function &F, UniformityInfo &UA, DominatorTree *DT) { bool Changed = false; SmallVector<PHINode *> ToBeDeleted; for (auto &BB : F) { for (auto &PHI : BB.phis()) { - if (DA->isDivergent(&PHI)) + if (UA.isDivergent(&PHI)) continue; // The unique incoming value except undef/poison for the PHI node. @@ -147,7 +147,7 @@ bool rewritePHIs(Function &F, LegacyDivergenceAnalysis *DA, DominatorTree *DT) { // TODO: We should still be able to replace undef value if the unique // value is a Constant. if (!UniqueDefinedIncoming || Undefs.empty() || - !DA->isDivergent(DominateBB->getTerminator())) + !UA.isDivergent(DominateBB->getTerminator())) continue; // We only replace the undef when DominateBB truly dominates all the @@ -171,9 +171,10 @@ bool rewritePHIs(Function &F, LegacyDivergenceAnalysis *DA, DominatorTree *DT) { } bool AMDGPURewriteUndefForPHI::runOnFunction(Function &F) { - LegacyDivergenceAnalysis *DA = &getAnalysis<LegacyDivergenceAnalysis>(); + UniformityInfo &UA = + getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - return rewritePHIs(F, DA, DT); + return rewritePHIs(F, UA, DT); } FunctionPass *llvm::createAMDGPURewriteUndefForPHIPass() { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index ca714baffe3e..317f3f21d240 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -237,8 +237,6 @@ def : SourceOfDivergence<int_amdgcn_mbcnt_lo>; def : SourceOfDivergence<int_r600_read_tidig_x>; def : SourceOfDivergence<int_r600_read_tidig_y>; def : SourceOfDivergence<int_r600_read_tidig_z>; -def : SourceOfDivergence<int_amdgcn_atomic_inc>; -def : SourceOfDivergence<int_amdgcn_atomic_dec>; def : 
SourceOfDivergence<int_amdgcn_global_atomic_csub>; def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>; def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>; @@ -279,6 +277,22 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_swap>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_add>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_sub>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_smin>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_umin>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_smax>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_umax>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_and>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_or>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_xor>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_inc>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_dec>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>; +def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_sub>; @@ -295,6 +309,22 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>; def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>; +def : 
SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_swap>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_add>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_sub>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_smin>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_umin>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_smax>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_umax>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_and>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_or>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_xor>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_inc>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_dec>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>; +def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>; def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>; def : SourceOfDivergence<int_amdgcn_ps_live>; def : SourceOfDivergence<int_amdgcn_live_mask>; @@ -376,6 +406,26 @@ def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>; def : SourceOfDivergence<int_amdgcn_if>; def : SourceOfDivergence<int_amdgcn_else>; def : SourceOfDivergence<int_amdgcn_loop>; +def : SourceOfDivergence<int_amdgcn_inverse_ballot>; foreach intr = AMDGPUImageDimAtomicIntrinsics in def : SourceOfDivergence<intr>; + +class AlwaysUniform<Intrinsic intr> { + Intrinsic Intr = intr; +} + +def UniformIntrinsics : GenericTable { + let FilterClass = "AlwaysUniform"; + let Fields = ["Intr"]; + + let PrimaryKey = ["Intr"]; + let PrimaryKeyName = "lookupAlwaysUniform"; +} + +def : AlwaysUniform<int_amdgcn_readfirstlane>; +def : AlwaysUniform<int_amdgcn_readlane>; +def : AlwaysUniform<int_amdgcn_icmp>; +def : 
AlwaysUniform<int_amdgcn_fcmp>; +def : AlwaysUniform<int_amdgcn_ballot>; +def : AlwaysUniform<int_amdgcn_if_break>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 03ccd563975f..9b50f4fa53ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -416,8 +416,9 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( return Requested; } -std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( - const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const { +std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU( + std::pair<unsigned, unsigned> Requested, + std::pair<unsigned, unsigned> FlatWorkGroupSizes) const { // Default minimum/maximum number of waves per execution unit. std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU()); @@ -429,10 +430,6 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second); Default.first = MinImpliedByFlatWorkGroupSize; - // Requested minimum/maximum number of waves per execution unit. - std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute( - F, "amdgpu-waves-per-eu", Default, true); - // Make sure requested minimum is less than requested maximum. if (Requested.second && Requested.first > Requested.second) return Default; @@ -450,6 +447,17 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( return Requested; } +std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( + const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const { + // Default minimum/maximum number of waves per execution unit. + std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU()); + + // Requested minimum/maximum number of waves per execution unit. 
+ std::pair<unsigned, unsigned> Requested = + AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true); + return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes); +} + static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) { auto Node = Kernel.getMetadata("reqd_work_group_size"); if (Node && Node->getNumOperands() == 3) @@ -469,6 +477,15 @@ unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel, return getFlatWorkGroupSizes(Kernel).second - 1; } +bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const { + for (int I = 0; I < 3; ++I) { + if (getMaxWorkitemID(Func, I) > 0) + return false; + } + + return true; +} + bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { Function *Kernel = I->getParent()->getParent(); unsigned MinSize = 0; @@ -543,7 +560,9 @@ unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const { return 16; // Assume all implicit inputs are used by default - unsigned NBytes = (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) ? 256 : 56; + const Module *M = F.getParent(); + unsigned NBytes = + AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 
256 : 56; return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes", NBytes); } @@ -572,9 +591,13 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, Align &MaxAlign) const { + if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL && + F.getCallingConv() != CallingConv::SPIR_KERNEL) + return 0; + uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign); - unsigned ExplicitOffset = getExplicitKernelArgOffset(F); + unsigned ExplicitOffset = getExplicitKernelArgOffset(); uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes; unsigned ImplicitBytes = getImplicitArgNumBytes(F); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 972f996ad85a..10ce00fe68ca 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -14,9 +14,9 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H -#include "llvm/ADT/Triple.h" #include "llvm/IR/CallingConv.h" #include "llvm/Support/Alignment.h" +#include "llvm/TargetParser/Triple.h" namespace llvm { @@ -61,6 +61,7 @@ protected: bool HasFminFmaxLegacy = true; bool EnablePromoteAlloca = false; bool HasTrigReducedRange = false; + bool FastFMAF32 = false; unsigned EUsPerCU = 4; unsigned MaxWavesPerEU = 10; unsigned LocalMemorySize = 0; @@ -107,6 +108,9 @@ public: std::pair<unsigned, unsigned> getWavesPerEU(const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const; + std::pair<unsigned, unsigned> getEffectiveWavesPerEU( + std::pair<unsigned, unsigned> WavesPerEU, + std::pair<unsigned, unsigned> FlatWorkGroupSizes) const; /// Return the amount of LDS that can be used that will not restrict the /// occupancy lower than WaveCount. 
@@ -195,6 +199,10 @@ public: return HasTrigReducedRange; } + bool hasFastFMAF32() const { + return FastFMAF32; + } + bool isPromoteAllocaEnabled() const { return EnablePromoteAlloca; } @@ -226,7 +234,7 @@ public: /// Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. - unsigned getExplicitKernelArgOffset(const Function &F) const { + unsigned getExplicitKernelArgOffset() const { switch (TargetTriple.getOS()) { case Triple::AMDHSA: case Triple::AMDPAL: @@ -269,6 +277,9 @@ public: /// 2) dimension. unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const; + /// Return true if only a single workitem can be active in a wave. + bool isSingleLaneExecution(const Function &Kernel) const; + /// Creates value range metadata on an workitemid.* intrinsic call or load. bool makeLIDRangeMetadata(Instruction *I) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 5694acf40527..f90c8e4bdddd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -19,8 +19,10 @@ #include "AMDGPUExportClustering.h" #include "AMDGPUIGroupLP.h" #include "AMDGPUMacroFusion.h" +#include "AMDGPURegBankSelect.h" #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" +#include "AMDGPUUnifyDivergentExitNodes.h" #include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" #include "GCNVOPDUtils.h" @@ -43,7 +45,6 @@ #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" -#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" @@ -58,7 +59,7 @@ #include "llvm/Transforms/Scalar/InferAddressSpaces.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/SimplifyLibCalls.h" -#include "llvm/Transforms/Vectorize.h" +#include 
"llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include <optional> using namespace llvm; @@ -188,6 +189,11 @@ OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, cl::desc("Run pre-RA exec mask optimizations"), cl::init(true)); +static cl::opt<bool> + LowerCtorDtor("amdgpu-lower-global-ctor-dtor", + cl::desc("Lower GPU ctor / dtors to globals on the device."), + cl::init(true), cl::Hidden); + // Option to disable vectorizer for tests. static cl::opt<bool> EnableLoadStoreVectorizer( "amdgpu-load-store-vectorizer", @@ -216,6 +222,12 @@ static cl::opt<bool> EarlyInlineAll( cl::init(false), cl::Hidden); +static cl::opt<bool> RemoveIncompatibleFunctions( + "amdgpu-enable-remove-incompatible-functions", cl::Hidden, + cl::desc("Enable removal of functions when they" + "use features not supported by the target GPU"), + cl::init(true)); + static cl::opt<bool> EnableSDWAPeephole( "amdgpu-sdwa-peephole", cl::desc("Enable SDWA peepholer"), @@ -262,12 +274,15 @@ static cl::opt<bool> OptVGPRLiveRange( cl::desc("Enable VGPR liverange optimizations for if-else structure"), cl::init(true), cl::Hidden); -// Enable atomic optimization -static cl::opt<bool> EnableAtomicOptimizations( - "amdgpu-atomic-optimizations", - cl::desc("Enable atomic optimizations"), - cl::init(false), - cl::Hidden); +static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy( + "amdgpu-atomic-optimizer-strategy", + cl::desc("Select DPP or Iterative strategy for scan"), + cl::init(ScanOptions::Iterative), + cl::values( + clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"), + clEnumValN(ScanOptions::Iterative, "Iterative", + "Use Iterative approach for scan"), + clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer"))); // Enable Mode register optimization static cl::opt<bool> EnableSIModeRegisterPass( @@ -309,11 +324,6 @@ static cl::opt<bool> EnableStructurizerWorkarounds( cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden); -static 
cl::opt<bool> EnableLDSReplaceWithPointer( - "amdgpu-enable-lds-replace-with-pointer", - cl::desc("Enable LDS replace with pointer pass"), cl::init(false), - cl::Hidden); - static cl::opt<bool, true> EnableLowerModuleLDS( "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"), cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), @@ -334,9 +344,14 @@ static cl::opt<bool> EnableMaxIlpSchedStrategy( cl::desc("Enable scheduling strategy to maximize ILP for a single wave."), cl::Hidden, cl::init(false)); +static cl::opt<bool> EnableRewritePartialRegUses( + "amdgpu-enable-rewrite-partial-reg-uses", + cl::desc("Enable rewrite partial reg uses pass"), cl::init(false), + cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target - RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); + RegisterTargetMachine<R600TargetMachine> X(getTheR600Target()); RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget()); PassRegistry *PR = PassRegistry::getPassRegistry(); @@ -349,6 +364,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUDAGToDAGISelPass(*PR); initializeGCNDPPCombinePass(*PR); initializeSILowerI1CopiesPass(*PR); + initializeSILowerWWMCopiesPass(*PR); initializeSILowerSGPRSpillsPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); @@ -368,24 +384,21 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPULowerKernelArgumentsPass(*PR); initializeAMDGPUPromoteKernelArgumentsPass(*PR); initializeAMDGPULowerKernelAttributesPass(*PR); - initializeAMDGPULowerIntrinsicsPass(*PR); initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); initializeAMDGPUPostLegalizerCombinerPass(*PR); initializeAMDGPUPreLegalizerCombinerPass(*PR); initializeAMDGPURegBankCombinerPass(*PR); + initializeAMDGPURegBankSelectPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); 
initializeAMDGPUPromoteAllocaToVectorPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); initializeAMDGPULateCodeGenPreparePass(*PR); - initializeAMDGPUPropagateAttributesEarlyPass(*PR); - initializeAMDGPUPropagateAttributesLatePass(*PR); - initializeAMDGPUReplaceLDSUseWithPointerPass(*PR); + initializeAMDGPURemoveIncompatibleFunctionsPass(*PR); initializeAMDGPULowerModuleLDSPass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPURewriteUndefForPHIPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); - initializeAMDGPUReleaseVGPRsPass(*PR); initializeAMDGPUInsertDelayAluPass(*PR); initializeSIInsertHardClausesPass(*PR); initializeSIInsertWaitcntsPass(*PR); @@ -409,6 +422,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUResourceUsageAnalysisPass(*PR); initializeGCNNSAReassignPass(*PR); initializeGCNPreRAOptimizationsPass(*PR); + initializeGCNPreRALongBranchRegPass(*PR); + initializeGCNRewritePartialRegUsesPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -505,11 +520,15 @@ static StringRef computeDataLayout(const Triple &TT) { } // 32-bit private, local, and region pointers. 64-bit global, constant and - // flat, non-integral buffer fat pointers. + // flat. 160-bit non-integral fat buffer pointers that include a 128-bit + // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values + // (address space 7), and 128-bit non-integral buffer resourcees (address + // space 8) which cannot be non-trivilally accessed by LLVM memory operations + // like getelementptr. 
return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" - "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1" - "-ni:7"; + "-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:" + "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-" + "G1-ni:7:8"; } LLVM_READNONE @@ -584,12 +603,8 @@ void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PB.registerPipelineParsingCallback( - [this](StringRef PassName, ModulePassManager &PM, - ArrayRef<PassBuilder::PipelineElement>) { - if (PassName == "amdgpu-propagate-attributes-late") { - PM.addPass(AMDGPUPropagateAttributesLatePass(*this)); - return true; - } + [](StringRef PassName, ModulePassManager &PM, + ArrayRef<PassBuilder::PipelineElement>) { if (PassName == "amdgpu-unify-metadata") { PM.addPass(AMDGPUUnifyMetadataPass()); return true; @@ -602,10 +617,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(AMDGPUAlwaysInlinePass()); return true; } - if (PassName == "amdgpu-replace-lds-use-with-pointer") { - PM.addPass(AMDGPUReplaceLDSUseWithPointerPass()); - return true; - } if (PassName == "amdgpu-lower-module-lds") { PM.addPass(AMDGPULowerModuleLDSPass()); return true; @@ -639,14 +650,23 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PM.addPass(AMDGPULowerKernelAttributesPass()); return true; } - if (PassName == "amdgpu-propagate-attributes-early") { - PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); - return true; - } if (PassName == "amdgpu-promote-kernel-arguments") { PM.addPass(AMDGPUPromoteKernelArgumentsPass()); return true; } + if (PassName == "amdgpu-unify-divergent-exit-nodes") { + PM.addPass(AMDGPUUnifyDivergentExitNodesPass()); + return true; + } + if (PassName == "amdgpu-atomic-optimizer") { + PM.addPass( + 
AMDGPUAtomicOptimizerPass(*this, AMDGPUAtomicOptimizerStrategy)); + return true; + } + if (PassName == "amdgpu-codegenprepare") { + PM.addPass(AMDGPUCodeGenPreparePass(*this)); + return true; + } return false; }); @@ -665,7 +685,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PB.registerPipelineStartEPCallback( [this](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; - FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this)); FPM.addPass(AMDGPUUseNativeCallsPass()); if (EnableLibCallSimplify && Level != OptimizationLevel::O0) FPM.addPass(AMDGPUSimplifyLibCallsPass(*this)); @@ -673,20 +692,19 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { }); PB.registerPipelineEarlySimplificationEPCallback( - [this](ModulePassManager &PM, OptimizationLevel Level) { + [](ModulePassManager &PM, OptimizationLevel Level) { + PM.addPass(AMDGPUPrintfRuntimeBindingPass()); + if (Level == OptimizationLevel::O0) return; PM.addPass(AMDGPUUnifyMetadataPass()); - PM.addPass(AMDGPUPrintfRuntimeBindingPass()); if (InternalizeSymbols) { PM.addPass(InternalizePass(mustPreserveGV)); - } - PM.addPass(AMDGPUPropagateAttributesLatePass(*this)); - if (InternalizeSymbols) { PM.addPass(GlobalDCEPass()); } + if (EarlyInlineAll && !EnableFunctionCalls) PM.addPass(AMDGPUAlwaysInlinePass()); }); @@ -932,7 +950,6 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { } void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { - addPass(createLICMPass()); addPass(createSeparateConstOffsetFromGEPPass()); // ReassociateGEPs exposes more opportunities for SLSR. See // the example in reassociate-geps-and-slsr.ll. @@ -956,22 +973,12 @@ void AMDGPUPassConfig::addIRPasses() { disablePass(&PatchableFunctionID); addPass(createAMDGPUPrintfRuntimeBinding()); - addPass(createAMDGPUCtorDtorLoweringLegacyPass()); - - // A call to propagate attributes pass in the backend in case opt was not run. 
- addPass(createAMDGPUPropagateAttributesEarlyPass(&TM)); - - addPass(createAMDGPULowerIntrinsicsPass()); + if (LowerCtorDtor) + addPass(createAMDGPUCtorDtorLoweringLegacyPass()); // Function calls are not supported, so make sure we inline everything. addPass(createAMDGPUAlwaysInlinePass()); addPass(createAlwaysInlinerLegacyPass()); - // We need to add the barrier noop pass, otherwise adding the function - // inlining pass will cause all of the PassConfigs passes to be run - // one function at a time, which means if we have a module with two - // functions, then we will generate code for the first function - // without ever running any passes on the second. - addPass(createBarrierNoopPass()); // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. if (TM.getTargetTriple().getArch() == Triple::r600) @@ -980,17 +987,16 @@ void AMDGPUPassConfig::addIRPasses() { // Replace OpenCL enqueued block function pointers with global variables. addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); - // Can increase LDS used by kernel so runs before PromoteAlloca + // Runs before PromoteAlloca so the latter can account for function uses if (EnableLowerModuleLDS) { - // The pass "amdgpu-replace-lds-use-with-pointer" need to be run before the - // pass "amdgpu-lower-module-lds", and also it required to be run only if - // "amdgpu-lower-module-lds" pass is enabled. - if (EnableLDSReplaceWithPointer) - addPass(createAMDGPUReplaceLDSUseWithPointerPass()); - addPass(createAMDGPULowerModuleLDSPass()); } + // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run + // after their introduction + if (TM.getOptLevel() > CodeGenOpt::None) + addPass(createAMDGPUAttributorPass()); + if (TM.getOptLevel() > CodeGenOpt::None) addPass(createInferAddressSpacesPass()); @@ -1017,6 +1023,11 @@ void AMDGPUPassConfig::addIRPasses() { // TODO: May want to move later or split into an early and late one. 
addPass(createAMDGPUCodeGenPreparePass()); } + + // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may + // have expanded. + if (TM.getOptLevel() > CodeGenOpt::Less) + addPass(createLICMPass()); } TargetPassConfig::addIRPasses(); @@ -1039,7 +1050,8 @@ void AMDGPUPassConfig::addIRPasses() { void AMDGPUPassConfig::addCodeGenPrepare() { if (TM->getTargetTriple().getArch() == Triple::amdgcn) { - addPass(createAMDGPUAttributorPass()); + if (RemoveIncompatibleFunctions) + addPass(createAMDGPURemoveIncompatibleFunctionsPass(TM)); // FIXME: This pass adds 2 hacky attributes that can be replaced with an // analysis, and should be removed. @@ -1117,8 +1129,9 @@ bool GCNPassConfig::addPreISel() { if (TM->getOptLevel() > CodeGenOpt::None) addPass(createAMDGPULateCodeGenPreparePass()); - if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) { - addPass(createAMDGPUAtomicOptimizerPass()); + if ((TM->getOptLevel() >= CodeGenOpt::Less) && + (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) { + addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy)); } if (TM->getOptLevel() > CodeGenOpt::None) @@ -1211,7 +1224,7 @@ void GCNPassConfig::addPreRegBankSelect() { } bool GCNPassConfig::addRegBankSelect() { - addPass(new RegBankSelect()); + addPass(new AMDGPURegBankSelect()); return false; } @@ -1255,6 +1268,9 @@ void GCNPassConfig::addOptimizedRegAlloc() { if (OptExecMaskPreRA) insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); + if (EnableRewritePartialRegUses) + insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID); + if (isPassEnabled(EnablePreRAOptimizations)) insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID); @@ -1281,6 +1297,7 @@ void GCNPassConfig::addOptimizedRegAlloc() { } bool GCNPassConfig::addPreRewrite() { + addPass(&SILowerWWMCopiesID); if (EnableRegReassign) addPass(&GCNNSAReassignID); return true; @@ -1327,12 +1344,16 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() { if 
(!usingDefaultRegAlloc()) report_fatal_error(RegAllocOptNotSupportedMessage); + addPass(&GCNPreRALongBranchRegID); + addPass(createSGPRAllocPass(false)); // Equivalent of PEI for SGPRs. addPass(&SILowerSGPRSpillsID); addPass(createVGPRAllocPass(false)); + + addPass(&SILowerWWMCopiesID); return true; } @@ -1340,6 +1361,8 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() { if (!usingDefaultRegAlloc()) report_fatal_error(RegAllocOptNotSupportedMessage); + addPass(&GCNPreRALongBranchRegID); + addPass(createSGPRAllocPass(true)); // Commit allocated register changes. This is mostly necessary because too @@ -1398,9 +1421,6 @@ void GCNPassConfig::addPreEmitPass() { // cases. addPass(&PostRAHazardRecognizerID); - if (getOptLevel() > CodeGenOpt::Less) - addPass(&AMDGPUReleaseVGPRsID); - if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less)) addPass(&AMDGPUInsertDelayAluID); @@ -1411,6 +1431,12 @@ TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { return new GCNPassConfig(*this, PM); } +void GCNTargetMachine::registerMachineRegisterInfoCallback( + MachineFunction &MF) const { + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + MF.getRegInfo().addDelegate(MFI); +} + MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo( BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const { @@ -1465,6 +1491,13 @@ bool GCNTargetMachine::parseMachineFunctionInfo( if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy)) return true; + if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy)) + return true; + + if (parseOptionalRegister(YamlMFI.LongBranchReservedReg, + MFI->LongBranchReservedReg)) + return true; + auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) { // Create a diagnostic for a the register string literal. 
const MemoryBuffer &Buffer = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index ce93704b78f4..2426be405a65 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -92,6 +92,8 @@ public: return true; } + void registerMachineRegisterInfoCallback(MachineFunction &MF) const override; + MachineFunctionInfo * createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 0c3324f84b25..81d083c1c88a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -17,8 +17,11 @@ #include "AMDGPUTargetTransformInfo.h" #include "AMDGPUTargetMachine.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIModeRegisterDefaults.h" +#include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/PatternMatch.h" @@ -49,11 +52,6 @@ static cl::opt<bool> UnrollRuntimeLocal( cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden); -static cl::opt<bool> UseLegacyDA( - "amdgpu-use-legacy-divergence-analysis", - cl::desc("Enable legacy divergence analysis for AMDGPU"), - cl::init(false), cl::Hidden); - static cl::opt<unsigned> UnrollMaxBlockToAnalyze( "amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), @@ -115,6 +113,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, // manipulations in average. UP.BEInsns += 3; + // We want to run unroll even for the loops which have been vectorized. 
+ UP.UnrollVectorizedLoop = true; + // TODO: Do we want runtime unrolling? // Maximum alloca size than can fit registers. Reserve 16 registers. @@ -266,6 +267,10 @@ void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, BaseT::getPeelingPreferences(L, SE, PP); } +int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const { + return 1024; +} + const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = { // Codegen control options which don't matter. AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler, @@ -291,9 +296,14 @@ GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))), TLI(ST->getTargetLowering()), CommonTTI(TM, F), IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) { - AMDGPU::SIModeRegisterDefaults Mode(F); - HasFP32Denormals = Mode.allFP32Denormals(); - HasFP64FP16Denormals = Mode.allFP64FP16Denormals(); + SIModeRegisterDefaults Mode(F); + HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign(); + HasFP64FP16Denormals = + Mode.FP64FP16Denormals != DenormalMode::getPreserveSign(); +} + +bool GCNTTIImpl::hasBranchDivergence(const Function *F) const { + return !F || !ST->isSingleLaneExecution(*F); } unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const { @@ -357,7 +367,8 @@ unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS || AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT || - AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) { + AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER || + AddrSpace == AMDGPUAS::BUFFER_RESOURCE) { return 512; } @@ -393,6 +404,10 @@ bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); } +int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const { + return 1024; +} + // FIXME: Really we would 
like to issue multiple 128-bit loads and stores per // iteration. Should we report a larger size and let it legalize? // @@ -472,10 +487,10 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType( } } -unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) { +unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) { // Disable unrolling if the loop is not vectorized. // TODO: Enable this again. - if (VF == 1) + if (VF.isScalar()) return 1; return 8; @@ -484,8 +499,6 @@ unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) { bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const { switch (Inst->getIntrinsicID()) { - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_ds_fadd: @@ -775,15 +788,15 @@ GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, } InstructionCost -GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, - bool IsUnsigned, +GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, + FastMathFlags FMF, TTI::TargetCostKind CostKind) { EVT OrigTy = TLI->getValueType(DL, Ty); // Computes cost on targets that have packed math instructions(which support // 16-bit types only). if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) - return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind); + return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); return LT.first * getHalfRateInstrCost(CostKind); @@ -857,11 +870,6 @@ bool GCNTTIImpl::isInlineAsmSourceOfDivergence( return false; } -/// \returns true if the new GPU divergence analysis is enabled. 
-bool GCNTTIImpl::useGPUDivergenceAnalysis() const { - return !UseLegacyDA; -} - bool GCNTTIImpl::isReadRegisterSourceOfDivergence( const IntrinsicInst *ReadReg) const { Metadata *MD = @@ -928,19 +936,8 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { } bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { - if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { - switch (Intrinsic->getIntrinsicID()) { - default: - return false; - case Intrinsic::amdgcn_readfirstlane: - case Intrinsic::amdgcn_readlane: - case Intrinsic::amdgcn_icmp: - case Intrinsic::amdgcn_fcmp: - case Intrinsic::amdgcn_ballot: - case Intrinsic::amdgcn_if_break: - return true; - } - } + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) + return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID()); if (const CallInst *CI = dyn_cast<CallInst>(V)) { if (CI->isInlineAsm()) @@ -1012,8 +1009,6 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, Intrinsic::ID IID) const { switch (IID) { - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: @@ -1034,8 +1029,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *NewV) const { auto IntrID = II->getIntrinsicID(); switch (IntrID) { - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { @@ -1099,9 +1092,12 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin: { - Module *M = II->getParent()->getParent()->getParent(); Type *DestTy = II->getType(); Type *SrcTy = NewV->getType(); + unsigned NewAS = 
SrcTy->getPointerAddressSpace(); + if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS)) + return nullptr; + Module *M = II->getModule(); Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy}); II->setArgOperand(0, NewV); @@ -1157,8 +1153,8 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller, // FIXME: dx10_clamp can just take the caller setting, but there seems to be // no way to support merge for backend defined attributes. - AMDGPU::SIModeRegisterDefaults CallerMode(*Caller); - AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee); + SIModeRegisterDefaults CallerMode(*Caller); + SIModeRegisterDefaults CalleeMode(*Callee); if (!CallerMode.isInlineCompatible(CalleeMode)) return false; @@ -1178,34 +1174,129 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller, return true; } -unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const { - // If we have a pointer to private array passed into a function +static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, + const SITargetLowering *TLI, + const GCNTTIImpl *TTIImpl) { + const int NrOfSGPRUntilSpill = 26; + const int NrOfVGPRUntilSpill = 32; + + const DataLayout &DL = TTIImpl->getDataLayout(); + + unsigned adjustThreshold = 0; + int SGPRsInUse = 0; + int VGPRsInUse = 0; + for (const Use &A : CB->args()) { + SmallVector<EVT, 4> ValueVTs; + ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs); + for (auto ArgVT : ValueVTs) { + unsigned CCRegNum = TLI->getNumRegistersForCallingConv( + CB->getContext(), CB->getCallingConv(), ArgVT); + if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A))) + SGPRsInUse += CCRegNum; + else + VGPRsInUse += CCRegNum; + } + } + + // The cost of passing function arguments through the stack: + // 1 instruction to put a function argument on the stack in the caller. + // 1 instruction to take a function argument from the stack in callee. 
+ // 1 instruction is explicitly take care of data dependencies in callee + // function. + InstructionCost ArgStackCost(1); + ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost( + Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4), + AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency); + ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost( + Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4), + AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency); + + // The penalty cost is computed relative to the cost of instructions and does + // not model any storage costs. + adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) * + *ArgStackCost.getValue() * InlineConstants::getInstrCost(); + adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) * + *ArgStackCost.getValue() * InlineConstants::getInstrCost(); + return adjustThreshold; +} + +static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, + const DataLayout &DL) { + // If we have a pointer to a private array passed into a function // it will not be optimized out, leaving scratch usage. - // Increase the inline threshold to allow inlining in this case. - uint64_t AllocaSize = 0; + // This function calculates the total size in bytes of the memory that would + // end in scratch if the call was not inlined. 
+ unsigned AllocaSize = 0; SmallPtrSet<const AllocaInst *, 8> AIVisited; for (Value *PtrArg : CB->args()) { PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType()); - if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS && - Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) + if (!Ty) continue; - PtrArg = getUnderlyingObject(PtrArg); - if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) { - if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second) - continue; - AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType()); - // If the amount of stack memory is excessive we will not be able - // to get rid of the scratch anyway, bail out. - if (AllocaSize > ArgAllocaCutoff) { - AllocaSize = 0; - break; - } - } + unsigned AddrSpace = Ty->getAddressSpace(); + if (AddrSpace != AMDGPUAS::FLAT_ADDRESS && + AddrSpace != AMDGPUAS::PRIVATE_ADDRESS) + continue; + + const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg)); + if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second) + continue; + + AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType()); } - if (AllocaSize) - return ArgAllocaCost; - return 0; + return AllocaSize; +} + +unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const { + unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this); + + // Private object passed as arguments may end up in scratch usage if the call + // is not inlined. Increase the inline threshold to promote inlining. 
+ unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL); + if (AllocaSize > 0) + Threshold += ArgAllocaCost; + return Threshold; +} + +unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB, + const AllocaInst *AI) const { + + // Below the cutoff, assume that the private memory objects would be + // optimized + auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL); + if (AllocaSize <= ArgAllocaCutoff) + return 0; + + // Above the cutoff, we give a cost to each private memory object + // depending its size. If the array can be optimized by SROA this cost is not + // added to the total-cost in the inliner cost analysis. + // + // We choose the total cost of the alloca such that their sum cancels the + // bonus given in the threshold (ArgAllocaCost). + // + // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost + // + // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier, + // the single-bb bonus and the vector-bonus. + // + // We compensate the first two multipliers, by repeating logic from the + // inliner-cost in here. The vector-bonus is 0 on AMDGPU. 
+ static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0"); + unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier(); + + bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) { + return BB.getTerminator()->getNumSuccessors() > 1; + }); + if (SingleBB) { + Threshold += Threshold / 2; + } + + auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType()); + + // Attribute the bonus proportionally to the alloca size + unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize; + + return AllocaThresholdBonus; } void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 7862f21cfc35..1e6c5bbfc0d7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -55,6 +55,8 @@ public: void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP); + + int64_t getMaxMemIntrinsicInlineSizeThreshold() const; }; class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { @@ -69,6 +71,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { bool IsGraphics; bool HasFP32Denormals; bool HasFP64FP16Denormals; + static constexpr bool InlinerVectorBonusPercent = 0; static const FeatureBitset InlineFeatureIgnoreList; @@ -100,8 +103,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { public: explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F); - bool hasBranchDivergence() { return true; } - bool useGPUDivergenceAnalysis() const; + bool hasBranchDivergence(const Function *F = nullptr) const; void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, @@ -133,6 +135,8 @@ public: unsigned AddrSpace) const; bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const; + + int64_t 
getMaxMemIntrinsicInlineSizeThreshold() const; Type *getMemcpyLoopLoweringType( LLVMContext & Context, Value * Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, @@ -143,7 +147,7 @@ public: unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional<uint32_t> AtomicCpySize) const; - unsigned getMaxInterleaveFactor(unsigned VF); + unsigned getMaxInterleaveFactor(ElementCount VF); bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const; @@ -169,6 +173,32 @@ public: bool isSourceOfDivergence(const Value *V) const; bool isAlwaysUniform(const Value *V) const; + bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { + if (ToAS == AMDGPUAS::FLAT_ADDRESS) { + switch (FromAS) { + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: + case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::PRIVATE_ADDRESS: + return true; + default: + break; + } + return false; + } + if ((FromAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && + ToAS == AMDGPUAS::CONSTANT_ADDRESS) || + (FromAS == AMDGPUAS::CONSTANT_ADDRESS && + ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)) + return true; + return false; + } + + bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const { + return AMDGPU::addrspacesMayAlias(AS0, AS1); + } + unsigned getFlatAddressSpace() const { // Don't bother running InferAddressSpaces pass on graphics shaders which // don't use flat addressing. 
@@ -188,8 +218,8 @@ public: Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const; - bool canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1, - InstCombiner &IC) const; + bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0, + const Value *Op1, InstCombiner &IC) const; std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const; std::optional<Value *> simplifyDemandedVectorEltsIntrinsic( @@ -209,10 +239,11 @@ public: bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - unsigned getInliningThresholdMultiplier() { return 11; } + unsigned getInliningThresholdMultiplier() const { return 11; } unsigned adjustInliningThreshold(const CallBase *CB) const; + unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const; - int getInlinerVectorBonusPercent() { return 0; } + int getInlinerVectorBonusPercent() const { return InlinerVectorBonusPercent; } InstructionCost getArithmeticReductionCost( unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF, @@ -220,9 +251,9 @@ public: InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind); - InstructionCost getMinMaxReductionCost( - VectorType *Ty, VectorType *CondTy, bool IsUnsigned, - TTI::TargetCostKind CostKind); + InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, + FastMathFlags FMF, + TTI::TargetCostKind CostKind); }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index c27e69a0bcbb..9ad841c3c8a5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -19,6 +19,7 @@ // //===----------------------------------------------------------------------===// +#include "AMDGPUUnifyDivergentExitNodes.h" #include "AMDGPU.h" #include 
"SIDefines.h" #include "llvm/ADT/ArrayRef.h" @@ -26,9 +27,9 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/DomTreeUpdater.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -53,25 +54,33 @@ using namespace llvm; namespace { -class AMDGPUUnifyDivergentExitNodes : public FunctionPass { +class AMDGPUUnifyDivergentExitNodesImpl { private: const TargetTransformInfo *TTI = nullptr; public: - static char ID; // Pass identification, replacement for typeid - - AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) { - initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry()); - } + AMDGPUUnifyDivergentExitNodesImpl() = delete; + AMDGPUUnifyDivergentExitNodesImpl(const TargetTransformInfo *TTI) + : TTI(TTI) {} // We can preserve non-critical-edgeness when we unify function exit nodes - void getAnalysisUsage(AnalysisUsage &AU) const override; BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks, StringRef Name); - bool runOnFunction(Function &F) override; + bool run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT, + const UniformityInfo &UA); }; +class AMDGPUUnifyDivergentExitNodes : public FunctionPass { +public: + static char ID; + AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) { + initializeAMDGPUUnifyDivergentExitNodesPass( + *PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; +}; } // end anonymous namespace char AMDGPUUnifyDivergentExitNodes::ID = 0; @@ -79,20 +88,20 @@ char AMDGPUUnifyDivergentExitNodes::ID = 0; char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID; 
INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, - "Unify divergent function exit nodes", false, false) + "Unify divergent function exit nodes", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, "Unify divergent function exit nodes", false, false) -void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{ +void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const { if (RequireAndPreserveDomTree) AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<PostDominatorTreeWrapperPass>(); - AU.addRequired<LegacyDivergenceAnalysis>(); + AU.addRequired<UniformityInfoWrapperPass>(); if (RequireAndPreserveDomTree) { AU.addPreserved<DominatorTreeWrapperPass>(); @@ -100,7 +109,7 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{ } // No divergent values are changed, only blocks and branch edges. - AU.addPreserved<LegacyDivergenceAnalysis>(); + AU.addPreserved<UniformityInfoWrapperPass>(); // We preserve the non-critical-edgeness property AU.addPreservedID(BreakCriticalEdgesID); @@ -114,14 +123,13 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{ /// \returns true if \p BB is reachable through only uniform branches. /// XXX - Is there a more efficient way to find this? 
-static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA, - BasicBlock &BB) { +static bool isUniformlyReached(const UniformityInfo &UA, BasicBlock &BB) { SmallVector<BasicBlock *, 8> Stack(predecessors(&BB)); SmallPtrSet<BasicBlock *, 8> Visited; while (!Stack.empty()) { BasicBlock *Top = Stack.pop_back_val(); - if (!DA.isUniform(Top->getTerminator())) + if (!UA.isUniform(Top->getTerminator())) return false; for (BasicBlock *Pred : predecessors(Top)) { @@ -133,7 +141,7 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA, return true; } -BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet( +BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks, StringRef Name) { // Otherwise, we need to insert a new basic block into the function, add a PHI @@ -181,20 +189,14 @@ BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet( return NewRetBlock; } -bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { - DominatorTree *DT = nullptr; - if (RequireAndPreserveDomTree) - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - - auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); +bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, + const PostDominatorTree &PDT, + const UniformityInfo &UA) { if (PDT.root_size() == 0 || (PDT.root_size() == 1 && !isa<BranchInst>(PDT.getRoot()->getTerminator()))) return false; - LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>(); - TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - // Loop over all of the blocks in a function, tracking all of the blocks that // return. SmallVector<BasicBlock *, 4> ReturningBlocks; @@ -213,7 +215,7 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { // exits, we should only unify UnreachableBlocks that are not uniformly // reachable. 
bool HasDivergentExitBlock = llvm::any_of( - PDT.roots(), [&](auto BB) { return !isUniformlyReached(DA, *BB); }); + PDT.roots(), [&](auto BB) { return !isUniformlyReached(UA, *BB); }); for (BasicBlock *BB : PDT.roots()) { if (isa<ReturnInst>(BB->getTerminator())) { @@ -327,3 +329,30 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock"); return true; } + +bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { + DominatorTree *DT = nullptr; + if (RequireAndPreserveDomTree) + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + const auto &PDT = + getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); + const auto &UA = getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); + const auto *TranformInfo = + &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + return AMDGPUUnifyDivergentExitNodesImpl(TranformInfo).run(F, DT, PDT, UA); +} + +PreservedAnalyses +AMDGPUUnifyDivergentExitNodesPass::run(Function &F, + FunctionAnalysisManager &AM) { + DominatorTree *DT = nullptr; + if (RequireAndPreserveDomTree) + DT = &AM.getResult<DominatorTreeAnalysis>(F); + + const auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); + const auto &UA = AM.getResult<UniformityInfoAnalysis>(F); + const auto *TransformInfo = &AM.getResult<TargetIRAnalysis>(F); + return AMDGPUUnifyDivergentExitNodesImpl(TransformInfo).run(F, DT, PDT, UA) + ? PreservedAnalyses::none() + : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h new file mode 100644 index 000000000000..2fd98a2ee1a9 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h @@ -0,0 +1,36 @@ +//===- AMDGPUUnifyDivergentExitNodes.h ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring +// there is at most one ret and one unreachable instruction, it ensures there is +// at most one divergent exiting block. +// +// StructurizeCFG can't deal with multi-exit regions formed by branches to +// multiple return nodes. It is not desirable to structurize regions with +// uniform branches, so unifying those to the same return block as divergent +// branches inhibits use of scalar branching. It still can't deal with the case +// where one branch goes to return, and one unreachable. Replace unreachable in +// this case with a return. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUUNIFYDIVERGENTEXITNODES_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUUNIFYDIVERGENTEXITNODES_H + +#include "AMDGPU.h" + +namespace llvm { +class AMDGPUUnifyDivergentExitNodesPass + : public PassInfoMixin<AMDGPUUnifyDivergentExitNodesPass> { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUUNIFYDIVERGENTEXITNODES_H diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 671d263a41a4..b9443559132f 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/StringSet.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -35,9 +36,8 @@ #include "llvm/Support/AMDGPUMetadata.h" #include 
"llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/TargetParser.h" +#include "llvm/TargetParser/TargetParser.h" #include <optional> using namespace llvm; @@ -119,16 +119,16 @@ public: ImmTyInstOffset, ImmTyOffset0, ImmTyOffset1, + ImmTySMEMOffsetMod, ImmTyCPol, - ImmTySWZ, ImmTyTFE, ImmTyD16, ImmTyClampSI, ImmTyOModSI, - ImmTySdwaDstSel, - ImmTySdwaSrc0Sel, - ImmTySdwaSrc1Sel, - ImmTySdwaDstUnused, + ImmTySDWADstSel, + ImmTySDWASrc0Sel, + ImmTySDWASrc1Sel, + ImmTySDWADstUnused, ImmTyDMask, ImmTyDim, ImmTyUNorm, @@ -145,7 +145,7 @@ public: ImmTySendMsg, ImmTyInterpSlot, ImmTyInterpAttr, - ImmTyAttrChan, + ImmTyInterpAttrChan, ImmTyOpSel, ImmTyOpSelHi, ImmTyNegLo, @@ -155,7 +155,7 @@ public: ImmTyDppRowMask, ImmTyDppBankMask, ImmTyDppBoundCtrl, - ImmTyDppFi, + ImmTyDppFI, ImmTySwizzle, ImmTyGprIdxMode, ImmTyHigh, @@ -347,6 +347,8 @@ public: return isImm() && Imm.Type == ImmT; } + bool isImmLiteral() const { return isImmTy(ImmTyNone); } + bool isImmModifier() const { return isImm() && Imm.Type != ImmTyNone; } @@ -370,26 +372,25 @@ public: bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); } bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<8>(getImm()); } bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); } - + bool isSMEMOffsetMod() const { return isImmTy(ImmTySMEMOffsetMod); } bool isFlatOffset() const { return isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset); } bool isGDS() const { return isImmTy(ImmTyGDS); } bool isLDS() const { return isImmTy(ImmTyLDS); } bool isCPol() const { return isImmTy(ImmTyCPol); } - bool isSWZ() const { return isImmTy(ImmTySWZ); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isD16() const { return isImmTy(ImmTyD16); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); } - bool isBankMask() const { return 
isImmTy(ImmTyDppBankMask); } - bool isRowMask() const { return isImmTy(ImmTyDppRowMask); } + bool isDppBankMask() const { return isImmTy(ImmTyDppBankMask); } + bool isDppRowMask() const { return isImmTy(ImmTyDppRowMask); } bool isDppBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); } - bool isFI() const { return isImmTy(ImmTyDppFi); } - bool isSDWADstSel() const { return isImmTy(ImmTySdwaDstSel); } - bool isSDWASrc0Sel() const { return isImmTy(ImmTySdwaSrc0Sel); } - bool isSDWASrc1Sel() const { return isImmTy(ImmTySdwaSrc1Sel); } - bool isSDWADstUnused() const { return isImmTy(ImmTySdwaDstUnused); } + bool isDppFI() const { return isImmTy(ImmTyDppFI); } + bool isSDWADstSel() const { return isImmTy(ImmTySDWADstSel); } + bool isSDWASrc0Sel() const { return isImmTy(ImmTySDWASrc0Sel); } + bool isSDWASrc1Sel() const { return isImmTy(ImmTySDWASrc1Sel); } + bool isSDWADstUnused() const { return isImmTy(ImmTySDWADstUnused); } bool isInterpSlot() const { return isImmTy(ImmTyInterpSlot); } bool isInterpAttr() const { return isImmTy(ImmTyInterpAttr); } - bool isAttrChan() const { return isImmTy(ImmTyAttrChan); } + bool isInterpAttrChan() const { return isImmTy(ImmTyInterpAttrChan); } bool isOpSel() const { return isImmTy(ImmTyOpSel); } bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); } bool isNegLo() const { return isImmTy(ImmTyNegLo); } @@ -855,13 +856,11 @@ public: return Kind == Expression; } - bool isSoppBrTarget() const { - return isExpr() || isImm(); - } + bool isSOPPBrTarget() const { return isExpr() || isImm(); } bool isSWaitCnt() const; bool isDepCtr() const; - bool isSDelayAlu() const; + bool isSDelayALU() const; bool isHwreg() const; bool isSendMsg() const; bool isSwizzle() const; @@ -948,28 +947,11 @@ public: void addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyModifiers) const; - template <unsigned Bitwidth> - void addKImmFPOperands(MCInst &Inst, unsigned N) const; - - void addKImmFP16Operands(MCInst &Inst, unsigned N) const { - 
addKImmFPOperands<16>(Inst, N); - } - - void addKImmFP32Operands(MCInst &Inst, unsigned N) const { - addKImmFPOperands<32>(Inst, N); - } - void addRegOperands(MCInst &Inst, unsigned N) const; - void addBoolRegOperands(MCInst &Inst, unsigned N) const { - addRegOperands(Inst, N); - } - void addRegOrImmOperands(MCInst &Inst, unsigned N) const { if (isRegKind()) addRegOperands(Inst, N); - else if (isExpr()) - Inst.addOperand(MCOperand::createExpr(Expr)); else addImmOperands(Inst, N); } @@ -1011,15 +993,6 @@ public: addRegWithInputModsOperands(Inst, N); } - void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { - if (isImm()) - addImmOperands(Inst, N); - else { - assert(isExpr()); - Inst.addOperand(MCOperand::createExpr(Expr)); - } - } - static void printImmTy(raw_ostream& OS, ImmTy Type) { switch (Type) { case ImmTyNone: OS << "None"; break; @@ -1032,8 +1005,8 @@ public: case ImmTyInstOffset: OS << "InstOffset"; break; case ImmTyOffset0: OS << "Offset0"; break; case ImmTyOffset1: OS << "Offset1"; break; + case ImmTySMEMOffsetMod: OS << "SMEMOffsetMod"; break; case ImmTyCPol: OS << "CPol"; break; - case ImmTySWZ: OS << "SWZ"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; @@ -1044,11 +1017,11 @@ public: case ImmTyDppRowMask: OS << "DppRowMask"; break; case ImmTyDppBankMask: OS << "DppBankMask"; break; case ImmTyDppBoundCtrl: OS << "DppBoundCtrl"; break; - case ImmTyDppFi: OS << "FI"; break; - case ImmTySdwaDstSel: OS << "SdwaDstSel"; break; - case ImmTySdwaSrc0Sel: OS << "SdwaSrc0Sel"; break; - case ImmTySdwaSrc1Sel: OS << "SdwaSrc1Sel"; break; - case ImmTySdwaDstUnused: OS << "SdwaDstUnused"; break; + case ImmTyDppFI: OS << "DppFI"; break; + case ImmTySDWADstSel: OS << "SDWADstSel"; break; + case ImmTySDWASrc0Sel: OS << "SDWASrc0Sel"; break; + case ImmTySDWASrc1Sel: OS << "SDWASrc1Sel"; break; + case ImmTySDWADstUnused: OS << "SDWADstUnused"; break; case ImmTyDMask: OS << "DMask"; break; 
case ImmTyDim: OS << "Dim"; break; case ImmTyUNorm: OS << "UNorm"; break; @@ -1064,7 +1037,7 @@ public: case ImmTySendMsg: OS << "SendMsg"; break; case ImmTyInterpSlot: OS << "InterpSlot"; break; case ImmTyInterpAttr: OS << "InterpAttr"; break; - case ImmTyAttrChan: OS << "AttrChan"; break; + case ImmTyInterpAttrChan: OS << "InterpAttrChan"; break; case ImmTyOpSel: OS << "OpSel"; break; case ImmTyOpSelHi: OS << "OpSelHi"; break; case ImmTyNegLo: OS << "NegLo"; break; @@ -1339,8 +1312,6 @@ private: unsigned RegWidth); void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic); - void cvtDSImpl(MCInst &Inst, const OperandVector &Operands, - bool IsGdsHardcoded); public: enum AMDGPUMatchResultTy { @@ -1481,6 +1452,14 @@ public: return getFeatureBits()[AMDGPU::FeatureIntClamp]; } + bool hasPartialNSAEncoding() const { + return getFeatureBits()[AMDGPU::FeaturePartialNSAEncoding]; + } + + unsigned getNSAMaxSize() const { + return AMDGPU::getNSAMaxSize(getSTI()); + } + AMDGPUTargetStreamer &getTargetStreamer() { MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); return static_cast<AMDGPUTargetStreamer &>(TS); @@ -1526,36 +1505,34 @@ public: uint64_t &ErrorInfo, bool MatchingInlineAsm) override; bool ParseDirective(AsmToken DirectiveID) override; - OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic, - OperandMode Mode = OperandMode_Default); + ParseStatus parseOperand(OperandVector &Operands, StringRef Mnemonic, + OperandMode Mode = OperandMode_Default); StringRef parseMnemonicSuffix(StringRef Name); bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; //bool ProcessInstruction(MCInst &Inst); - OperandMatchResultTy parseTokenOp(StringRef Name, OperandVector &Operands); + ParseStatus parseTokenOp(StringRef Name, OperandVector &Operands); - OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int); + ParseStatus 
parseIntWithPrefix(const char *Prefix, int64_t &Int); - OperandMatchResultTy + ParseStatus parseIntWithPrefix(const char *Prefix, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, - bool (*ConvertResult)(int64_t &) = nullptr); + std::function<bool(int64_t &)> ConvertResult = nullptr); - OperandMatchResultTy - parseOperandArrayWithPrefix(const char *Prefix, - OperandVector &Operands, - AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, - bool (*ConvertResult)(int64_t&) = nullptr); + ParseStatus parseOperandArrayWithPrefix( + const char *Prefix, OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, + bool (*ConvertResult)(int64_t &) = nullptr); - OperandMatchResultTy + ParseStatus parseNamedBit(StringRef Name, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone); unsigned getCPolKind(StringRef Id, StringRef Mnemo, bool &Disabling) const; - OperandMatchResultTy parseCPol(OperandVector &Operands); - OperandMatchResultTy parseStringWithPrefix(StringRef Prefix, - StringRef &Value, - SMLoc &StringLoc); + ParseStatus parseCPol(OperandVector &Operands); + ParseStatus parseStringWithPrefix(StringRef Prefix, StringRef &Value, + SMLoc &StringLoc); bool isModifier(); bool isOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const; @@ -1563,42 +1540,44 @@ public: bool isNamedOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const; bool isOpcodeModifierWithVal(const AsmToken &Token, const AsmToken &NextToken) const; bool parseSP3NegModifier(); - OperandMatchResultTy parseImm(OperandVector &Operands, bool HasSP3AbsModifier = false); - OperandMatchResultTy parseReg(OperandVector &Operands); - OperandMatchResultTy parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod = false); - OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true); - OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool 
AllowImm = true); - OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands); - OperandMatchResultTy parseRegWithIntInputMods(OperandVector &Operands); - OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands); - OperandMatchResultTy parseDfmtNfmt(int64_t &Format); - OperandMatchResultTy parseUfmt(int64_t &Format); - OperandMatchResultTy parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc, int64_t &Format); - OperandMatchResultTy parseSymbolicUnifiedFormat(StringRef FormatStr, SMLoc Loc, int64_t &Format); - OperandMatchResultTy parseFORMAT(OperandVector &Operands); - OperandMatchResultTy parseSymbolicOrNumericFormat(int64_t &Format); - OperandMatchResultTy parseNumericFormat(int64_t &Format); - OperandMatchResultTy parseFlatOffset(OperandVector &Operands); - OperandMatchResultTy parseR128A16(OperandVector &Operands); + ParseStatus parseImm(OperandVector &Operands, bool HasSP3AbsModifier = false); + ParseStatus parseReg(OperandVector &Operands); + ParseStatus parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod = false); + ParseStatus parseRegOrImmWithFPInputMods(OperandVector &Operands, + bool AllowImm = true); + ParseStatus parseRegOrImmWithIntInputMods(OperandVector &Operands, + bool AllowImm = true); + ParseStatus parseRegWithFPInputMods(OperandVector &Operands); + ParseStatus parseRegWithIntInputMods(OperandVector &Operands); + ParseStatus parseVReg32OrOff(OperandVector &Operands); + ParseStatus parseDfmtNfmt(int64_t &Format); + ParseStatus parseUfmt(int64_t &Format); + ParseStatus parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc, + int64_t &Format); + ParseStatus parseSymbolicUnifiedFormat(StringRef FormatStr, SMLoc Loc, + int64_t &Format); + ParseStatus parseFORMAT(OperandVector &Operands); + ParseStatus parseSymbolicOrNumericFormat(int64_t &Format); + ParseStatus parseNumericFormat(int64_t &Format); + ParseStatus parseFlatOffset(OperandVector &Operands); + ParseStatus parseR128A16(OperandVector &Operands); + ParseStatus 
parseBLGP(OperandVector &Operands); bool tryParseFmt(const char *Pref, int64_t MaxVal, int64_t &Val); bool matchDfmtNfmt(int64_t &Dfmt, int64_t &Nfmt, StringRef FormatStr, SMLoc Loc); - void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); - void cvtDS(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, false); } - void cvtDSGds(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, true); } void cvtExp(MCInst &Inst, const OperandVector &Operands); bool parseCnt(int64_t &IntVal); - OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); + ParseStatus parseSWaitCnt(OperandVector &Operands); bool parseDepCtr(int64_t &IntVal, unsigned &Mask); void depCtrError(SMLoc Loc, int ErrorId, StringRef DepCtrName); - OperandMatchResultTy parseDepCtrOps(OperandVector &Operands); + ParseStatus parseDepCtr(OperandVector &Operands); bool parseDelay(int64_t &Delay); - OperandMatchResultTy parseSDelayAluOps(OperandVector &Operands); + ParseStatus parseSDelayALU(OperandVector &Operands); - OperandMatchResultTy parseHwreg(OperandVector &Operands); + ParseStatus parseHwreg(OperandVector &Operands); private: struct OperandInfoTy { @@ -1648,7 +1627,7 @@ private: bool validateMIMGGatherDMask(const MCInst &Inst); bool validateMovrels(const MCInst &Inst, const OperandVector &Operands); bool validateMIMGDataSize(const MCInst &Inst, const SMLoc &IDLoc); - bool validateMIMGAddrSize(const MCInst &Inst); + bool validateMIMGAddrSize(const MCInst &Inst, const SMLoc &IDLoc); bool validateMIMGD16(const MCInst &Inst); bool validateMIMGMSAA(const MCInst &Inst); bool validateOpSel(const MCInst &Inst); @@ -1706,15 +1685,14 @@ private: public: void onBeginOfFile() override; - OperandMatchResultTy parseCustomOperand(OperandVector &Operands, - unsigned MCK); + ParseStatus parseCustomOperand(OperandVector &Operands, unsigned MCK); - OperandMatchResultTy parseExpTgt(OperandVector &Operands); - OperandMatchResultTy parseSendMsgOp(OperandVector 
&Operands); - OperandMatchResultTy parseInterpSlot(OperandVector &Operands); - OperandMatchResultTy parseInterpAttr(OperandVector &Operands); - OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); - OperandMatchResultTy parseBoolReg(OperandVector &Operands); + ParseStatus parseExpTgt(OperandVector &Operands); + ParseStatus parseSendMsg(OperandVector &Operands); + ParseStatus parseInterpSlot(OperandVector &Operands); + ParseStatus parseInterpAttr(OperandVector &Operands); + ParseStatus parseSOPPBrTarget(OperandVector &Operands); + ParseStatus parseBoolReg(OperandVector &Operands); bool parseSwizzleOperand(int64_t &Op, const unsigned MinVal, @@ -1725,7 +1703,7 @@ public: const unsigned MinVal, const unsigned MaxVal, const StringRef ErrMsg); - OperandMatchResultTy parseSwizzleOp(OperandVector &Operands); + ParseStatus parseSwizzle(OperandVector &Operands); bool parseSwizzleOffset(int64_t &Imm); bool parseSwizzleMacro(int64_t &Imm); bool parseSwizzleQuadPerm(int64_t &Imm); @@ -1734,21 +1712,13 @@ public: bool parseSwizzleSwap(int64_t &Imm); bool parseSwizzleReverse(int64_t &Imm); - OperandMatchResultTy parseGPRIdxMode(OperandVector &Operands); + ParseStatus parseGPRIdxMode(OperandVector &Operands); int64_t parseGPRIdxMacro(); void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false); } void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true); } - void cvtMtbuf(MCInst &Inst, const OperandVector &Operands); - - AMDGPUOperand::Ptr defaultCPol() const; - AMDGPUOperand::Ptr defaultSMRDOffset8() const; - AMDGPUOperand::Ptr defaultSMEMOffset() const; - AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; - AMDGPUOperand::Ptr defaultFlatOffset() const; - - OperandMatchResultTy parseOModOperand(OperandVector &Operands); + ParseStatus parseOModSI(OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx); @@ -1763,25 +1733,16 
@@ public: void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands); void cvtVINTERP(MCInst &Inst, const OperandVector &Operands); - - void cvtMIMG(MCInst &Inst, const OperandVector &Operands, - bool IsAtomic = false); - void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands); - void cvtIntersectRay(MCInst &Inst, const OperandVector &Operands); - void cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands); bool parseDimId(unsigned &Encoding); - OperandMatchResultTy parseDim(OperandVector &Operands); - OperandMatchResultTy parseDPP8(OperandVector &Operands); - OperandMatchResultTy parseDPPCtrl(OperandVector &Operands); + ParseStatus parseDim(OperandVector &Operands); + bool convertDppBoundCtrl(int64_t &BoundCtrl); + ParseStatus parseDPP8(OperandVector &Operands); + ParseStatus parseDPPCtrl(OperandVector &Operands); bool isSupportedDPPCtrl(StringRef Ctrl, const OperandVector &Operands); int64_t parseDPPCtrlSel(StringRef Ctrl); int64_t parseDPPCtrlPerm(); - AMDGPUOperand::Ptr defaultRowMask() const; - AMDGPUOperand::Ptr defaultBankMask() const; - AMDGPUOperand::Ptr defaultDppBoundCtrl() const; - AMDGPUOperand::Ptr defaultFI() const; void cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8 = false); void cvtDPP8(MCInst &Inst, const OperandVector &Operands) { cvtDPP(Inst, Operands, true); @@ -1792,9 +1753,9 @@ public: cvtVOP3DPP(Inst, Operands, true); } - OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix, - AMDGPUOperand::ImmTy Type); - OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands); + ParseStatus parseSDWASel(OperandVector &Operands, StringRef Prefix, + AMDGPUOperand::ImmTy Type); + ParseStatus parseSDWADstUnused(OperandVector &Operands); void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands); @@ -1805,16 +1766,9 @@ public: bool SkipDstVcc = false, bool SkipSrcVcc 
= false); - AMDGPUOperand::Ptr defaultBLGP() const; - AMDGPUOperand::Ptr defaultCBSZ() const; - AMDGPUOperand::Ptr defaultABID() const; - - OperandMatchResultTy parseEndpgmOp(OperandVector &Operands); - AMDGPUOperand::Ptr defaultEndpgmImmOperands() const; + ParseStatus parseEndpgm(OperandVector &Operands); - AMDGPUOperand::Ptr defaultWaitVDST() const; - AMDGPUOperand::Ptr defaultWaitEXP() const; - OperandMatchResultTy parseVOPD(OperandVector &Operands); + ParseStatus parseVOPD(OperandVector &Operands); }; } // end anonymous namespace @@ -2089,6 +2043,11 @@ uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const } void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const { + if (isExpr()) { + Inst.addOperand(MCOperand::createExpr(Expr)); + return; + } + if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()), Inst.getNumOperands())) { addLiteralImmOperand(Inst, Imm.Val, @@ -2285,24 +2244,6 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo } } -template <unsigned Bitwidth> -void AMDGPUOperand::addKImmFPOperands(MCInst &Inst, unsigned N) const { - APInt Literal(64, Imm.Val); - setImmKindMandatoryLiteral(); - - if (!Imm.IsFPImm) { - // We got int literal token. 
- Inst.addOperand(MCOperand::createImm(Literal.getLoBits(Bitwidth).getZExtValue())); - return; - } - - bool Lost; - APFloat FPLiteral(APFloat::IEEEdouble(), Literal); - FPLiteral.convert(*getFltSemantics(Bitwidth / 8), - APFloat::rmNearestTiesToEven, &Lost); - Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue())); -} - void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const { Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), AsmParser->getSTI()))); } @@ -2922,12 +2863,12 @@ AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) { return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc); } -OperandMatchResultTy -AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) { +ParseStatus AMDGPUAsmParser::parseImm(OperandVector &Operands, + bool HasSP3AbsModifier) { // TODO: add syntactic sugar for 1/(2*PI) if (isRegister()) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; assert(!isModifier()); const auto& Tok = getToken(); @@ -2952,9 +2893,8 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) { APFloat RealVal(APFloat::IEEEdouble()); auto roundMode = APFloat::rmNearestTiesToEven; - if (errorToBool(RealVal.convertFromString(Num, roundMode).takeError())) { - return MatchOperand_ParseFail; - } + if (errorToBool(RealVal.convertFromString(Num, roundMode).takeError())) + return ParseStatus::Failure; if (Negate) RealVal.changeSign(); @@ -2962,7 +2902,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) { AMDGPUOperand::CreateImm(this, RealVal.bitcastToAPInt().getZExtValue(), S, AMDGPUOperand::ImmTyNone, true)); - return MatchOperand_Success; + return ParseStatus::Success; } else { int64_t IntVal; @@ -2979,10 +2919,10 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) { // MC expressions (due to the trailing '|'). 
SMLoc EndLoc; if (getParser().parsePrimaryExpr(Expr, EndLoc, nullptr)) - return MatchOperand_ParseFail; + return ParseStatus::Failure; } else { if (Parser.parseExpression(Expr)) - return MatchOperand_ParseFail; + return ParseStatus::Failure; } if (Expr->evaluateAsAbsolute(IntVal)) { @@ -2991,35 +2931,32 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) { Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S)); } - return MatchOperand_Success; + return ParseStatus::Success; } - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; } -OperandMatchResultTy -AMDGPUAsmParser::parseReg(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseReg(OperandVector &Operands) { if (!isRegister()) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; if (auto R = parseRegister()) { assert(R->isReg()); Operands.push_back(std::move(R)); - return MatchOperand_Success; + return ParseStatus::Success; } - return MatchOperand_ParseFail; + return ParseStatus::Failure; } -OperandMatchResultTy -AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod) { - auto res = parseReg(Operands); - if (res != MatchOperand_NoMatch) { - return res; - } else if (isModifier()) { - return MatchOperand_NoMatch; - } else { - return parseImm(Operands, HasSP3AbsMod); - } +ParseStatus AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, + bool HasSP3AbsMod) { + ParseStatus Res = parseReg(Operands); + if (!Res.isNoMatch()) + return Res; + if (isModifier()) + return ParseStatus::NoMatch; + return parseImm(Operands, HasSP3AbsMod); } bool @@ -3110,7 +3047,7 @@ AMDGPUAsmParser::parseSP3NegModifier() { return false; } -OperandMatchResultTy +ParseStatus AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm) { bool Neg, SP3Neg; @@ -3118,49 +3055,42 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, SMLoc Loc; // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead. 
- if (isToken(AsmToken::Minus) && peekToken().is(AsmToken::Minus)) { - Error(getLoc(), "invalid syntax, expected 'neg' modifier"); - return MatchOperand_ParseFail; - } + if (isToken(AsmToken::Minus) && peekToken().is(AsmToken::Minus)) + return Error(getLoc(), "invalid syntax, expected 'neg' modifier"); SP3Neg = parseSP3NegModifier(); Loc = getLoc(); Neg = trySkipId("neg"); - if (Neg && SP3Neg) { - Error(Loc, "expected register or immediate"); - return MatchOperand_ParseFail; - } + if (Neg && SP3Neg) + return Error(Loc, "expected register or immediate"); if (Neg && !skipToken(AsmToken::LParen, "expected left paren after neg")) - return MatchOperand_ParseFail; + return ParseStatus::Failure; Abs = trySkipId("abs"); if (Abs && !skipToken(AsmToken::LParen, "expected left paren after abs")) - return MatchOperand_ParseFail; + return ParseStatus::Failure; Loc = getLoc(); SP3Abs = trySkipToken(AsmToken::Pipe); - if (Abs && SP3Abs) { - Error(Loc, "expected register or immediate"); - return MatchOperand_ParseFail; - } + if (Abs && SP3Abs) + return Error(Loc, "expected register or immediate"); - OperandMatchResultTy Res; + ParseStatus Res; if (AllowImm) { Res = parseRegOrImm(Operands, SP3Abs); } else { Res = parseReg(Operands); } - if (Res != MatchOperand_Success) { - return (SP3Neg || Neg || SP3Abs || Abs)? MatchOperand_ParseFail : Res; - } + if (!Res.isSuccess()) + return (SP3Neg || Neg || SP3Abs || Abs) ? 
ParseStatus::Failure : Res; if (SP3Abs && !skipToken(AsmToken::Pipe, "expected vertical bar")) - return MatchOperand_ParseFail; + return ParseStatus::Failure; if (Abs && !skipToken(AsmToken::RParen, "expected closing parentheses")) - return MatchOperand_ParseFail; + return ParseStatus::Failure; if (Neg && !skipToken(AsmToken::RParen, "expected closing parentheses")) - return MatchOperand_ParseFail; + return ParseStatus::Failure; AMDGPUOperand::Modifiers Mods; Mods.Abs = Abs || SP3Abs; @@ -3168,79 +3098,71 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, if (Mods.hasFPModifiers()) { AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); - if (Op.isExpr()) { - Error(Op.getStartLoc(), "expected an absolute expression"); - return MatchOperand_ParseFail; - } + if (Op.isExpr()) + return Error(Op.getStartLoc(), "expected an absolute expression"); Op.setModifiers(Mods); } - return MatchOperand_Success; + return ParseStatus::Success; } -OperandMatchResultTy +ParseStatus AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm) { bool Sext = trySkipId("sext"); if (Sext && !skipToken(AsmToken::LParen, "expected left paren after sext")) - return MatchOperand_ParseFail; + return ParseStatus::Failure; - OperandMatchResultTy Res; + ParseStatus Res; if (AllowImm) { Res = parseRegOrImm(Operands); } else { Res = parseReg(Operands); } - if (Res != MatchOperand_Success) { - return Sext? MatchOperand_ParseFail : Res; - } + if (!Res.isSuccess()) + return Sext ? 
ParseStatus::Failure : Res; if (Sext && !skipToken(AsmToken::RParen, "expected closing parentheses")) - return MatchOperand_ParseFail; + return ParseStatus::Failure; AMDGPUOperand::Modifiers Mods; Mods.Sext = Sext; if (Mods.hasIntModifiers()) { AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); - if (Op.isExpr()) { - Error(Op.getStartLoc(), "expected an absolute expression"); - return MatchOperand_ParseFail; - } + if (Op.isExpr()) + return Error(Op.getStartLoc(), "expected an absolute expression"); Op.setModifiers(Mods); } - return MatchOperand_Success; + return ParseStatus::Success; } -OperandMatchResultTy -AMDGPUAsmParser::parseRegWithFPInputMods(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseRegWithFPInputMods(OperandVector &Operands) { return parseRegOrImmWithFPInputMods(Operands, false); } -OperandMatchResultTy -AMDGPUAsmParser::parseRegWithIntInputMods(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseRegWithIntInputMods(OperandVector &Operands) { return parseRegOrImmWithIntInputMods(Operands, false); } -OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) { auto Loc = getLoc(); if (trySkipId("off")) { Operands.push_back(AMDGPUOperand::CreateImm(this, 0, Loc, AMDGPUOperand::ImmTyOff, false)); - return MatchOperand_Success; + return ParseStatus::Success; } if (!isRegister()) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; std::unique_ptr<AMDGPUOperand> Reg = parseRegister(); if (Reg) { Operands.push_back(std::move(Reg)); - return MatchOperand_Success; + return ParseStatus::Success; } - return MatchOperand_ParseFail; - + return ParseStatus::Failure; } unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { @@ -3647,7 +3569,8 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst, return false; } -bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) { +bool 
AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, + const SMLoc &IDLoc) { const unsigned Opc = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opc); @@ -3667,8 +3590,13 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) { assert(SrsrcIdx != -1); assert(SrsrcIdx > VAddr0Idx); - if (DimIdx == -1) - return true; // intersect_ray + bool IsA16 = Inst.getOperand(A16Idx).getImm(); + if (BaseOpcode->BVH) { + if (IsA16 == BaseOpcode->A16) + return true; + Error(IDLoc, "image address size does not match a16"); + return false; + } unsigned Dim = Inst.getOperand(DimIdx).getImm(); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim); @@ -3676,12 +3604,19 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) { unsigned ActualAddrSize = IsNSA ? SrsrcIdx - VAddr0Idx : AMDGPU::getRegOperandSize(getMRI(), Desc, VAddr0Idx) / 4; - bool IsA16 = (A16Idx != -1 && Inst.getOperand(A16Idx).getImm()); unsigned ExpectedAddrSize = AMDGPU::getAddrSizeMIMGOp(BaseOpcode, DimInfo, IsA16, hasG16()); - if (!IsNSA) { + if (IsNSA) { + if (hasPartialNSAEncoding() && ExpectedAddrSize > getNSAMaxSize()) { + int VAddrLastIdx = SrsrcIdx - 1; + unsigned VAddrLastSize = + AMDGPU::getRegOperandSize(getMRI(), Desc, VAddrLastIdx) / 4; + + ActualAddrSize = VAddrLastIdx - VAddr0Idx + VAddrLastSize; + } + } else { if (ExpectedAddrSize > 12) ExpectedAddrSize = 16; @@ -3692,7 +3627,11 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) { return true; } - return ActualAddrSize == ExpectedAddrSize; + if (ActualAddrSize == ExpectedAddrSize) + return true; + + Error(IDLoc, "image address size does not match dim and a16"); + return false; } bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) { @@ -4136,7 +4075,7 @@ SMLoc AMDGPUAsmParser::getSMEMOffsetLoc(const OperandVector &Operands) const { // Start with second operand because SMEM Offset cannot be dst or src0. 
for (unsigned i = 2, e = Operands.size(); i != e; ++i) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - if (Op.isSMEMOffset()) + if (Op.isSMEMOffset() || Op.isSMEMOffsetMod()) return Op.getStartLoc(); } return getLoc(); @@ -4628,11 +4567,8 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, if (!validateMIMGDataSize(Inst, IDLoc)) { return false; } - if (!validateMIMGAddrSize(Inst)) { - Error(IDLoc, - "image address size does not match dim and a16"); + if (!validateMIMGAddrSize(Inst, IDLoc)) return false; - } if (!validateMIMGAtomicDMask(Inst)) { Error(getImmLoc(AMDGPUOperand::ImmTyDMask, Operands), "invalid atomic image dmask"); @@ -5242,10 +5178,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { #undef PARSE_BITS_ENTRY } - if (Seen.find(".amdhsa_next_free_vgpr") == Seen.end()) + if (!Seen.contains(".amdhsa_next_free_vgpr")) return TokError(".amdhsa_next_free_vgpr directive is required"); - if (Seen.find(".amdhsa_next_free_sgpr") == Seen.end()) + if (!Seen.contains(".amdhsa_next_free_sgpr")) return TokError(".amdhsa_next_free_sgpr directive is required"); unsigned VGPRBlocks; @@ -5283,7 +5219,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { UserSGPRCount); if (isGFX90A()) { - if (Seen.find(".amdhsa_accum_offset") == Seen.end()) + if (!Seen.contains(".amdhsa_accum_offset")) return TokError(".amdhsa_accum_offset directive is required"); if (AccumOffset < 4 || AccumOffset > 256 || (AccumOffset & 3)) return TokError("accum_offset should be in range [4..256] in " @@ -5294,9 +5230,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { (AccumOffset / 4 - 1)); } - if (IVersion.Major == 10) { + if (IVersion.Major >= 10) { // SharedVGPRCount < 16 checked by PARSE_ENTRY_BITS - if (SharedVGPRCount && EnableWavefrontSize32) { + if (SharedVGPRCount && EnableWavefrontSize32 && *EnableWavefrontSize32) { return TokError("shared_vgpr_count directive not valid on " "wavefront size 32"); } @@ -5309,7 +5245,7 @@ bool 
AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { getTargetStreamer().EmitAmdhsaKernelDescriptor( getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC, - ReserveFlatScr); + ReserveFlatScr, AMDGPU::getAmdhsaCodeObjectVersion()); return false; } @@ -5487,10 +5423,10 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { const char *AssemblerDirectiveEnd; std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) = isHsaAbiVersion3AndAbove(&getSTI()) - ? std::tuple(HSAMD::V3::AssemblerDirectiveBegin, - HSAMD::V3::AssemblerDirectiveEnd) - : std::tuple(HSAMD::AssemblerDirectiveBegin, - HSAMD::AssemblerDirectiveEnd); + ? std::pair(HSAMD::V3::AssemblerDirectiveBegin, + HSAMD::V3::AssemblerDirectiveEnd) + : std::pair(HSAMD::AssemblerDirectiveBegin, + HSAMD::AssemblerDirectiveEnd); if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA) { return Error(getLoc(), @@ -5609,7 +5545,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() { return TokError("expected identifier in directive"); MCSymbol *Symbol = getContext().getOrCreateSymbol(Name); - if (parseToken(AsmToken::Comma, "expected ','")) + if (getParser().parseComma()) return true; unsigned LocalMemorySize = AMDGPU::IsaInfo::getLocalMemorySize(&getSTI()); @@ -5758,16 +5694,15 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, return true; } -OperandMatchResultTy -AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic, - OperandMode Mode) { - OperandMatchResultTy ResTy = parseVOPD(Operands); - if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail || - isToken(AsmToken::EndOfStatement)) - return ResTy; +ParseStatus AMDGPUAsmParser::parseOperand(OperandVector &Operands, + StringRef Mnemonic, + OperandMode Mode) { + ParseStatus Res = parseVOPD(Operands); + if (Res.isSuccess() || Res.isFailure() || isToken(AsmToken::EndOfStatement)) + return Res; // Try to parse with a custom parser - ResTy = MatchOperandParserImpl(Operands, Mnemonic); + Res = 
MatchOperandParserImpl(Operands, Mnemonic); // If we successfully parsed the operand or if there as an error parsing, // we are done. @@ -5775,9 +5710,8 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic, // If we are parsing after we reach EndOfStatement then this means we // are appending default values to the Operands list. This is only done // by custom parser, so we shouldn't continue on to the generic parsing. - if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail || - isToken(AsmToken::EndOfStatement)) - return ResTy; + if (Res.isSuccess() || Res.isFailure() || isToken(AsmToken::EndOfStatement)) + return Res; SMLoc RBraceLoc; SMLoc LBraceLoc = getLoc(); @@ -5786,20 +5720,19 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic, for (;;) { auto Loc = getLoc(); - ResTy = parseReg(Operands); - if (ResTy == MatchOperand_NoMatch) + Res = parseReg(Operands); + if (Res.isNoMatch()) Error(Loc, "expected a register"); - if (ResTy != MatchOperand_Success) - return MatchOperand_ParseFail; + if (!Res.isSuccess()) + return ParseStatus::Failure; RBraceLoc = getLoc(); if (trySkipToken(AsmToken::RBrac)) break; if (!skipToken(AsmToken::Comma, - "expected a comma or a closing square bracket")) { - return MatchOperand_ParseFail; - } + "expected a comma or a closing square bracket")) + return ParseStatus::Failure; } if (Operands.size() - Prefix > 1) { @@ -5808,7 +5741,7 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic, Operands.push_back(AMDGPUOperand::CreateToken(this, "]", RBraceLoc)); } - return MatchOperand_Success; + return ParseStatus::Success; } return parseRegOrImm(Operands); @@ -5862,15 +5795,14 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, OperandMode Mode = OperandMode_Default; if (IsMIMG && isGFX10Plus() && Operands.size() == 2) Mode = OperandMode_NSA; - OperandMatchResultTy Res = parseOperand(Operands, Name, Mode); + ParseStatus Res = 
parseOperand(Operands, Name, Mode); - if (Res != MatchOperand_Success) { + if (!Res.isSuccess()) { checkUnsupportedInstruction(Name, NameLoc); if (!Parser.hasPendingError()) { // FIXME: use real operand location rather than the current location. - StringRef Msg = - (Res == MatchOperand_ParseFail) ? "failed parsing operand." : - "not a valid operand."; + StringRef Msg = Res.isFailure() ? "failed parsing operand." + : "not a valid operand."; Error(getLoc(), Msg); } while (!trySkipToken(AsmToken::EndOfStatement)) { @@ -5890,34 +5822,33 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, // Utility functions //===----------------------------------------------------------------------===// -OperandMatchResultTy AMDGPUAsmParser::parseTokenOp(StringRef Name, - OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseTokenOp(StringRef Name, + OperandVector &Operands) { SMLoc S = getLoc(); if (!trySkipId(Name)) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; Operands.push_back(AMDGPUOperand::CreateToken(this, Name, S)); - return MatchOperand_Success; + return ParseStatus::Success; } -OperandMatchResultTy -AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &IntVal) { +ParseStatus AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, + int64_t &IntVal) { if (!trySkipId(Prefix, AsmToken::Colon)) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; - return parseExpr(IntVal) ? MatchOperand_Success : MatchOperand_ParseFail; + return parseExpr(IntVal) ? 
ParseStatus::Success : ParseStatus::Failure; } -OperandMatchResultTy -AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, - AMDGPUOperand::ImmTy ImmTy, - bool (*ConvertResult)(int64_t&)) { +ParseStatus AMDGPUAsmParser::parseIntWithPrefix( + const char *Prefix, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy, + std::function<bool(int64_t &)> ConvertResult) { SMLoc S = getLoc(); int64_t Value = 0; - OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Value); - if (Res != MatchOperand_Success) + ParseStatus Res = parseIntWithPrefix(Prefix, Value); + if (!Res.isSuccess()) return Res; if (ConvertResult && !ConvertResult(Value)) { @@ -5925,20 +5856,18 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, } Operands.push_back(AMDGPUOperand::CreateImm(this, Value, S, ImmTy)); - return MatchOperand_Success; + return ParseStatus::Success; } -OperandMatchResultTy -AMDGPUAsmParser::parseOperandArrayWithPrefix(const char *Prefix, - OperandVector &Operands, - AMDGPUOperand::ImmTy ImmTy, - bool (*ConvertResult)(int64_t&)) { +ParseStatus AMDGPUAsmParser::parseOperandArrayWithPrefix( + const char *Prefix, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy, + bool (*ConvertResult)(int64_t &)) { SMLoc S = getLoc(); if (!trySkipId(Prefix, AsmToken::Colon)) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; if (!skipToken(AsmToken::LBrac, "expected a left square bracket")) - return MatchOperand_ParseFail; + return ParseStatus::Failure; unsigned Val = 0; const unsigned MaxSize = 4; @@ -5949,34 +5878,30 @@ AMDGPUAsmParser::parseOperandArrayWithPrefix(const char *Prefix, int64_t Op; SMLoc Loc = getLoc(); if (!parseExpr(Op)) - return MatchOperand_ParseFail; + return ParseStatus::Failure; - if (Op != 0 && Op != 1) { - Error(Loc, "invalid " + StringRef(Prefix) + " value."); - return MatchOperand_ParseFail; - } + if (Op != 0 && Op != 1) + return Error(Loc, "invalid " + StringRef(Prefix) + " value."); Val |= (Op << 
I); if (trySkipToken(AsmToken::RBrac)) break; - if (I + 1 == MaxSize) { - Error(getLoc(), "expected a closing square bracket"); - return MatchOperand_ParseFail; - } + if (I + 1 == MaxSize) + return Error(getLoc(), "expected a closing square bracket"); if (!skipToken(AsmToken::Comma, "expected a comma")) - return MatchOperand_ParseFail; + return ParseStatus::Failure; } Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, ImmTy)); - return MatchOperand_Success; + return ParseStatus::Success; } -OperandMatchResultTy -AMDGPUAsmParser::parseNamedBit(StringRef Name, OperandVector &Operands, - AMDGPUOperand::ImmTy ImmTy) { +ParseStatus AMDGPUAsmParser::parseNamedBit(StringRef Name, + OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy) { int64_t Bit; SMLoc S = getLoc(); @@ -5985,54 +5910,42 @@ AMDGPUAsmParser::parseNamedBit(StringRef Name, OperandVector &Operands, } else if (trySkipId("no", Name)) { Bit = 0; } else { - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; } - if (Name == "r128" && !hasMIMG_R128()) { - Error(S, "r128 modifier is not supported on this GPU"); - return MatchOperand_ParseFail; - } - if (Name == "a16" && !hasA16()) { - Error(S, "a16 modifier is not supported on this GPU"); - return MatchOperand_ParseFail; - } + if (Name == "r128" && !hasMIMG_R128()) + return Error(S, "r128 modifier is not supported on this GPU"); + if (Name == "a16" && !hasA16()) + return Error(S, "a16 modifier is not supported on this GPU"); if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16) ImmTy = AMDGPUOperand::ImmTyR128A16; Operands.push_back(AMDGPUOperand::CreateImm(this, Bit, S, ImmTy)); - return MatchOperand_Success; + return ParseStatus::Success; } unsigned AMDGPUAsmParser::getCPolKind(StringRef Id, StringRef Mnemo, bool &Disabling) const { - Disabling = Id.startswith("no"); + Disabling = Id.consume_front("no"); if (isGFX940() && !Mnemo.startswith("s_")) { return StringSwitch<unsigned>(Id) .Case("nt", AMDGPU::CPol::NT) - .Case("nont", AMDGPU::CPol::NT) 
.Case("sc0", AMDGPU::CPol::SC0) - .Case("nosc0", AMDGPU::CPol::SC0) .Case("sc1", AMDGPU::CPol::SC1) - .Case("nosc1", AMDGPU::CPol::SC1) .Default(0); } return StringSwitch<unsigned>(Id) .Case("dlc", AMDGPU::CPol::DLC) - .Case("nodlc", AMDGPU::CPol::DLC) .Case("glc", AMDGPU::CPol::GLC) - .Case("noglc", AMDGPU::CPol::GLC) .Case("scc", AMDGPU::CPol::SCC) - .Case("noscc", AMDGPU::CPol::SCC) .Case("slc", AMDGPU::CPol::SLC) - .Case("noslc", AMDGPU::CPol::SLC) .Default(0); } -OperandMatchResultTy -AMDGPUAsmParser::parseCPol(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) { StringRef Mnemo = ((AMDGPUOperand &)*Operands[0]).getToken(); SMLoc OpLoc = getLoc(); unsigned Enabled = 0, Seen = 0; @@ -6045,20 +5958,14 @@ AMDGPUAsmParser::parseCPol(OperandVector &Operands) { lex(); - if (!isGFX10Plus() && CPol == AMDGPU::CPol::DLC) { - Error(S, "dlc modifier is not supported on this GPU"); - return MatchOperand_ParseFail; - } + if (!isGFX10Plus() && CPol == AMDGPU::CPol::DLC) + return Error(S, "dlc modifier is not supported on this GPU"); - if (!isGFX90A() && CPol == AMDGPU::CPol::SCC) { - Error(S, "scc modifier is not supported on this GPU"); - return MatchOperand_ParseFail; - } + if (!isGFX90A() && CPol == AMDGPU::CPol::SCC) + return Error(S, "scc modifier is not supported on this GPU"); - if (Seen & CPol) { - Error(S, "duplicate cache policy modifier"); - return MatchOperand_ParseFail; - } + if (Seen & CPol) + return Error(S, "duplicate cache policy modifier"); if (!Disabling) Enabled |= CPol; @@ -6067,11 +5974,11 @@ AMDGPUAsmParser::parseCPol(OperandVector &Operands) { } if (!Seen) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; Operands.push_back( AMDGPUOperand::CreateImm(this, Enabled, OpLoc, AMDGPUOperand::ImmTyCPol)); - return MatchOperand_Success; + return ParseStatus::Success; } static void addOptionalImmOperand( @@ -6088,16 +5995,15 @@ static void addOptionalImmOperand( } } -OperandMatchResultTy 
-AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, - StringRef &Value, - SMLoc &StringLoc) { +ParseStatus AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, + StringRef &Value, + SMLoc &StringLoc) { if (!trySkipId(Prefix, AsmToken::Colon)) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; StringLoc = getLoc(); - return parseId(Value, "expected an identifier") ? MatchOperand_Success - : MatchOperand_ParseFail; + return parseId(Value, "expected an identifier") ? ParseStatus::Success + : ParseStatus::Failure; } //===----------------------------------------------------------------------===// @@ -6111,9 +6017,9 @@ bool AMDGPUAsmParser::tryParseFmt(const char *Pref, SMLoc Loc = getLoc(); auto Res = parseIntWithPrefix(Pref, Val); - if (Res == MatchOperand_ParseFail) + if (Res.isFailure()) return false; - if (Res == MatchOperand_NoMatch) + if (Res.isNoMatch()) return true; if (Val < 0 || Val > MaxVal) { @@ -6127,8 +6033,7 @@ bool AMDGPUAsmParser::tryParseFmt(const char *Pref, // dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their // values to live in a joint format operand in the MCInst encoding. -OperandMatchResultTy -AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { +ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { using namespace llvm::AMDGPU::MTBUFFormat; int64_t Dfmt = DFMT_UNDEF; @@ -6137,11 +6042,11 @@ AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { // dfmt and nfmt can appear in either order, and each is optional. for (int I = 0; I < 2; ++I) { if (Dfmt == DFMT_UNDEF && !tryParseFmt("dfmt", DFMT_MAX, Dfmt)) - return MatchOperand_ParseFail; + return ParseStatus::Failure; + + if (Nfmt == NFMT_UNDEF && !tryParseFmt("nfmt", NFMT_MAX, Nfmt)) + return ParseStatus::Failure; - if (Nfmt == NFMT_UNDEF && !tryParseFmt("nfmt", NFMT_MAX, Nfmt)) { - return MatchOperand_ParseFail; - } // Skip optional comma between dfmt/nfmt // but guard against 2 commas following each other. 
if ((Dfmt == DFMT_UNDEF) != (Nfmt == NFMT_UNDEF) && @@ -6151,29 +6056,28 @@ AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { } if (Dfmt == DFMT_UNDEF && Nfmt == NFMT_UNDEF) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; Dfmt = (Dfmt == DFMT_UNDEF) ? DFMT_DEFAULT : Dfmt; Nfmt = (Nfmt == NFMT_UNDEF) ? NFMT_DEFAULT : Nfmt; Format = encodeDfmtNfmt(Dfmt, Nfmt); - return MatchOperand_Success; + return ParseStatus::Success; } -OperandMatchResultTy -AMDGPUAsmParser::parseUfmt(int64_t &Format) { +ParseStatus AMDGPUAsmParser::parseUfmt(int64_t &Format) { using namespace llvm::AMDGPU::MTBUFFormat; int64_t Fmt = UFMT_UNDEF; if (!tryParseFmt("format", UFMT_MAX, Fmt)) - return MatchOperand_ParseFail; + return ParseStatus::Failure; if (Fmt == UFMT_UNDEF) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; Format = Fmt; - return MatchOperand_Success; + return ParseStatus::Success; } bool AMDGPUAsmParser::matchDfmtNfmt(int64_t &Dfmt, @@ -6199,31 +6103,26 @@ bool AMDGPUAsmParser::matchDfmtNfmt(int64_t &Dfmt, return false; } -OperandMatchResultTy -AMDGPUAsmParser::parseSymbolicSplitFormat(StringRef FormatStr, - SMLoc FormatLoc, - int64_t &Format) { +ParseStatus AMDGPUAsmParser::parseSymbolicSplitFormat(StringRef FormatStr, + SMLoc FormatLoc, + int64_t &Format) { using namespace llvm::AMDGPU::MTBUFFormat; int64_t Dfmt = DFMT_UNDEF; int64_t Nfmt = NFMT_UNDEF; if (!matchDfmtNfmt(Dfmt, Nfmt, FormatStr, FormatLoc)) - return MatchOperand_ParseFail; + return ParseStatus::Failure; if (trySkipToken(AsmToken::Comma)) { StringRef Str; SMLoc Loc = getLoc(); if (!parseId(Str, "expected a format string") || - !matchDfmtNfmt(Dfmt, Nfmt, Str, Loc)) { - return MatchOperand_ParseFail; - } - if (Dfmt == DFMT_UNDEF) { - Error(Loc, "duplicate numeric format"); - return MatchOperand_ParseFail; - } else if (Nfmt == NFMT_UNDEF) { - Error(Loc, "duplicate data format"); - return MatchOperand_ParseFail; - } + !matchDfmtNfmt(Dfmt, Nfmt, Str, Loc)) + return ParseStatus::Failure; + if 
(Dfmt == DFMT_UNDEF) + return Error(Loc, "duplicate numeric format"); + if (Nfmt == NFMT_UNDEF) + return Error(Loc, "duplicate data format"); } Dfmt = (Dfmt == DFMT_UNDEF) ? DFMT_DEFAULT : Dfmt; @@ -6231,94 +6130,84 @@ AMDGPUAsmParser::parseSymbolicSplitFormat(StringRef FormatStr, if (isGFX10Plus()) { auto Ufmt = convertDfmtNfmt2Ufmt(Dfmt, Nfmt, getSTI()); - if (Ufmt == UFMT_UNDEF) { - Error(FormatLoc, "unsupported format"); - return MatchOperand_ParseFail; - } + if (Ufmt == UFMT_UNDEF) + return Error(FormatLoc, "unsupported format"); Format = Ufmt; } else { Format = encodeDfmtNfmt(Dfmt, Nfmt); } - return MatchOperand_Success; + return ParseStatus::Success; } -OperandMatchResultTy -AMDGPUAsmParser::parseSymbolicUnifiedFormat(StringRef FormatStr, - SMLoc Loc, - int64_t &Format) { +ParseStatus AMDGPUAsmParser::parseSymbolicUnifiedFormat(StringRef FormatStr, + SMLoc Loc, + int64_t &Format) { using namespace llvm::AMDGPU::MTBUFFormat; auto Id = getUnifiedFormat(FormatStr, getSTI()); if (Id == UFMT_UNDEF) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; - if (!isGFX10Plus()) { - Error(Loc, "unified format is not supported on this GPU"); - return MatchOperand_ParseFail; - } + if (!isGFX10Plus()) + return Error(Loc, "unified format is not supported on this GPU"); Format = Id; - return MatchOperand_Success; + return ParseStatus::Success; } -OperandMatchResultTy -AMDGPUAsmParser::parseNumericFormat(int64_t &Format) { +ParseStatus AMDGPUAsmParser::parseNumericFormat(int64_t &Format) { using namespace llvm::AMDGPU::MTBUFFormat; SMLoc Loc = getLoc(); if (!parseExpr(Format)) - return MatchOperand_ParseFail; - if (!isValidFormatEncoding(Format, getSTI())) { - Error(Loc, "out of range format"); - return MatchOperand_ParseFail; - } + return ParseStatus::Failure; + if (!isValidFormatEncoding(Format, getSTI())) + return Error(Loc, "out of range format"); - return MatchOperand_Success; + return ParseStatus::Success; } -OperandMatchResultTy 
-AMDGPUAsmParser::parseSymbolicOrNumericFormat(int64_t &Format) { +ParseStatus AMDGPUAsmParser::parseSymbolicOrNumericFormat(int64_t &Format) { using namespace llvm::AMDGPU::MTBUFFormat; if (!trySkipId("format", AsmToken::Colon)) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; if (trySkipToken(AsmToken::LBrac)) { StringRef FormatStr; SMLoc Loc = getLoc(); if (!parseId(FormatStr, "expected a format string")) - return MatchOperand_ParseFail; + return ParseStatus::Failure; auto Res = parseSymbolicUnifiedFormat(FormatStr, Loc, Format); - if (Res == MatchOperand_NoMatch) + if (Res.isNoMatch()) Res = parseSymbolicSplitFormat(FormatStr, Loc, Format); - if (Res != MatchOperand_Success) + if (!Res.isSuccess()) return Res; if (!skipToken(AsmToken::RBrac, "expected a closing square bracket")) - return MatchOperand_ParseFail; + return ParseStatus::Failure; - return MatchOperand_Success; + return ParseStatus::Success; } return parseNumericFormat(Format); } -OperandMatchResultTy -AMDGPUAsmParser::parseFORMAT(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseFORMAT(OperandVector &Operands) { using namespace llvm::AMDGPU::MTBUFFormat; int64_t Format = getDefaultFormatEncoding(getSTI()); - OperandMatchResultTy Res; + ParseStatus Res; SMLoc Loc = getLoc(); // Parse legacy format syntax. Res = isGFX10Plus() ? parseUfmt(Format) : parseDfmtNfmt(Format); - if (Res == MatchOperand_ParseFail) + if (Res.isFailure()) return Res; - bool FormatFound = (Res == MatchOperand_Success); + bool FormatFound = Res.isSuccess(); Operands.push_back( AMDGPUOperand::CreateImm(this, Format, Loc, AMDGPUOperand::ImmTyFORMAT)); @@ -6329,124 +6218,65 @@ AMDGPUAsmParser::parseFORMAT(OperandVector &Operands) { if (isToken(AsmToken::EndOfStatement)) { // We are expecting an soffset operand, // but let matcher handle the error. - return MatchOperand_Success; + return ParseStatus::Success; } // Parse soffset. 
Res = parseRegOrImm(Operands); - if (Res != MatchOperand_Success) + if (!Res.isSuccess()) return Res; trySkipToken(AsmToken::Comma); if (!FormatFound) { Res = parseSymbolicOrNumericFormat(Format); - if (Res == MatchOperand_ParseFail) + if (Res.isFailure()) return Res; - if (Res == MatchOperand_Success) { + if (Res.isSuccess()) { auto Size = Operands.size(); AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands[Size - 2]); assert(Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyFORMAT); Op.setImm(Format); } - return MatchOperand_Success; + return ParseStatus::Success; } - if (isId("format") && peekToken().is(AsmToken::Colon)) { - Error(getLoc(), "duplicate format"); - return MatchOperand_ParseFail; - } - return MatchOperand_Success; + if (isId("format") && peekToken().is(AsmToken::Colon)) + return Error(getLoc(), "duplicate format"); + return ParseStatus::Success; } -OperandMatchResultTy AMDGPUAsmParser::parseFlatOffset(OperandVector &Operands) { - OperandMatchResultTy Res = +ParseStatus AMDGPUAsmParser::parseFlatOffset(OperandVector &Operands) { + ParseStatus Res = parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset); - if (Res == MatchOperand_NoMatch) { + if (Res.isNoMatch()) { Res = parseIntWithPrefix("inst_offset", Operands, AMDGPUOperand::ImmTyInstOffset); } return Res; } -OperandMatchResultTy AMDGPUAsmParser::parseR128A16(OperandVector &Operands) { - OperandMatchResultTy Res = +ParseStatus AMDGPUAsmParser::parseR128A16(OperandVector &Operands) { + ParseStatus Res = parseNamedBit("r128", Operands, AMDGPUOperand::ImmTyR128A16); - if (Res == MatchOperand_NoMatch) + if (Res.isNoMatch()) Res = parseNamedBit("a16", Operands, AMDGPUOperand::ImmTyA16); return Res; } -//===----------------------------------------------------------------------===// -// ds -//===----------------------------------------------------------------------===// - -void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, - const OperandVector &Operands) { - OptionalImmIndexMap 
OptionalIdx; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - // Add the register arguments - if (Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; +ParseStatus AMDGPUAsmParser::parseBLGP(OperandVector &Operands) { + ParseStatus Res = + parseIntWithPrefix("blgp", Operands, AMDGPUOperand::ImmTyBLGP); + if (Res.isNoMatch()) { + Res = + parseOperandArrayWithPrefix("neg", Operands, AMDGPUOperand::ImmTyBLGP); } - - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset0); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset1); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS); - - Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 + return Res; } -void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands, - bool IsGdsHardcoded) { - OptionalImmIndexMap OptionalIdx; - const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); - AMDGPUOperand::ImmTy OffsetType = AMDGPUOperand::ImmTyOffset; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - auto TiedTo = - Desc.getOperandConstraint(Inst.getNumOperands(), MCOI::TIED_TO); - - if (TiedTo != -1) { - assert((unsigned)TiedTo < Inst.getNumOperands()); - Inst.addOperand(Inst.getOperand(TiedTo)); - } - - // Add the register arguments - if (Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } - - if (Op.isToken() && Op.getToken() == "gds") { - IsGdsHardcoded = true; - continue; - } - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; - - if (Op.getImmTy() == AMDGPUOperand::ImmTySwizzle) - OffsetType = AMDGPUOperand::ImmTySwizzle; - } - - addOptionalImmOperand(Inst, Operands, OptionalIdx, OffsetType); - - if (!IsGdsHardcoded) { - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS); - 
} - Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 -} +//===----------------------------------------------------------------------===// +// Exp +//===----------------------------------------------------------------------===// void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) { OptionalImmIndexMap OptionalIdx; @@ -6583,8 +6413,7 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { return true; } -OperandMatchResultTy -AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseSWaitCnt(OperandVector &Operands) { AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); int64_t Waitcnt = getWaitcntBitMask(ISA); SMLoc S = getLoc(); @@ -6592,15 +6421,15 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { if (isToken(AsmToken::Identifier) && peekToken().is(AsmToken::LParen)) { while (!isToken(AsmToken::EndOfStatement)) { if (!parseCnt(Waitcnt)) - return MatchOperand_ParseFail; + return ParseStatus::Failure; } } else { if (!parseExpr(Waitcnt)) - return MatchOperand_ParseFail; + return ParseStatus::Failure; } Operands.push_back(AMDGPUOperand::CreateImm(this, Waitcnt, S)); - return MatchOperand_Success; + return ParseStatus::Success; } bool AMDGPUAsmParser::parseDelay(int64_t &Delay) { @@ -6665,23 +6494,22 @@ bool AMDGPUAsmParser::parseDelay(int64_t &Delay) { return true; } -OperandMatchResultTy -AMDGPUAsmParser::parseSDelayAluOps(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseSDelayALU(OperandVector &Operands) { int64_t Delay = 0; SMLoc S = getLoc(); if (isToken(AsmToken::Identifier) && peekToken().is(AsmToken::LParen)) { do { if (!parseDelay(Delay)) - return MatchOperand_ParseFail; + return ParseStatus::Failure; } while (trySkipToken(AsmToken::Pipe)); } else { if (!parseExpr(Delay)) - return MatchOperand_ParseFail; + return ParseStatus::Failure; } Operands.push_back(AMDGPUOperand::CreateImm(this, Delay, S)); - return MatchOperand_Success; + return 
ParseStatus::Success; } bool @@ -6689,7 +6517,7 @@ AMDGPUOperand::isSWaitCnt() const { return isImm(); } -bool AMDGPUOperand::isSDelayAlu() const { return isImm(); } +bool AMDGPUOperand::isSDelayALU() const { return isImm(); } //===----------------------------------------------------------------------===// // DepCtr @@ -6753,7 +6581,7 @@ bool AMDGPUAsmParser::parseDepCtr(int64_t &DepCtr, unsigned &UsedOprMask) { return true; } -OperandMatchResultTy AMDGPUAsmParser::parseDepCtrOps(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseDepCtr(OperandVector &Operands) { using namespace llvm::AMDGPU::DepCtr; int64_t DepCtr = getDefaultDepCtrEncoding(getSTI()); @@ -6763,15 +6591,15 @@ OperandMatchResultTy AMDGPUAsmParser::parseDepCtrOps(OperandVector &Operands) { unsigned UsedOprMask = 0; while (!isToken(AsmToken::EndOfStatement)) { if (!parseDepCtr(DepCtr, UsedOprMask)) - return MatchOperand_ParseFail; + return ParseStatus::Failure; } } else { if (!parseExpr(DepCtr)) - return MatchOperand_ParseFail; + return ParseStatus::Failure; } Operands.push_back(AMDGPUOperand::CreateImm(this, DepCtr, Loc)); - return MatchOperand_Success; + return ParseStatus::Success; } bool AMDGPUOperand::isDepCtr() const { return isS16Imm(); } @@ -6847,8 +6675,7 @@ AMDGPUAsmParser::validateHwreg(const OperandInfoTy &HwReg, return true; } -OperandMatchResultTy -AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { using namespace llvm::AMDGPU::Hwreg; int64_t ImmVal = 0; @@ -6862,19 +6689,17 @@ AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { validateHwreg(HwReg, Offset, Width)) { ImmVal = encodeHwreg(HwReg.Id, Offset.Id, Width.Id); } else { - return MatchOperand_ParseFail; + return ParseStatus::Failure; } } else if (parseExpr(ImmVal, "a hwreg macro")) { - if (ImmVal < 0 || !isUInt<16>(ImmVal)) { - Error(Loc, "invalid immediate: only 16-bit values are legal"); - return MatchOperand_ParseFail; - } + if (ImmVal < 0 || 
!isUInt<16>(ImmVal)) + return Error(Loc, "invalid immediate: only 16-bit values are legal"); } else { - return MatchOperand_ParseFail; + return ParseStatus::Failure; } Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTyHwreg)); - return MatchOperand_Success; + return ParseStatus::Success; } bool AMDGPUOperand::isHwreg() const { @@ -6967,8 +6792,7 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg, return true; } -OperandMatchResultTy -AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseSendMsg(OperandVector &Operands) { using namespace llvm::AMDGPU::SendMsg; int64_t ImmVal = 0; @@ -6982,19 +6806,17 @@ AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) { validateSendMsg(Msg, Op, Stream)) { ImmVal = encodeMsg(Msg.Id, Op.Id, Stream.Id); } else { - return MatchOperand_ParseFail; + return ParseStatus::Failure; } } else if (parseExpr(ImmVal, "a sendmsg macro")) { - if (ImmVal < 0 || !isUInt<16>(ImmVal)) { - Error(Loc, "invalid immediate: only 16-bit values are legal"); - return MatchOperand_ParseFail; - } + if (ImmVal < 0 || !isUInt<16>(ImmVal)) + return Error(Loc, "invalid immediate: only 16-bit values are legal"); } else { - return MatchOperand_ParseFail; + return ParseStatus::Failure; } Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTySendMsg)); - return MatchOperand_Success; + return ParseStatus::Success; } bool AMDGPUOperand::isSendMsg() const { @@ -7005,12 +6827,12 @@ bool AMDGPUOperand::isSendMsg() const { // v_interp //===----------------------------------------------------------------------===// -OperandMatchResultTy AMDGPUAsmParser::parseInterpSlot(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseInterpSlot(OperandVector &Operands) { StringRef Str; SMLoc S = getLoc(); if (!parseId(Str)) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; int Slot = StringSwitch<int>(Str) .Case("p10", 0) @@ -7018,27 +6840,23 @@ 
OperandMatchResultTy AMDGPUAsmParser::parseInterpSlot(OperandVector &Operands) { .Case("p0", 2) .Default(-1); - if (Slot == -1) { - Error(S, "invalid interpolation slot"); - return MatchOperand_ParseFail; - } + if (Slot == -1) + return Error(S, "invalid interpolation slot"); Operands.push_back(AMDGPUOperand::CreateImm(this, Slot, S, AMDGPUOperand::ImmTyInterpSlot)); - return MatchOperand_Success; + return ParseStatus::Success; } -OperandMatchResultTy AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) { StringRef Str; SMLoc S = getLoc(); if (!parseId(Str)) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; - if (!Str.startswith("attr")) { - Error(S, "invalid interpolation attribute"); - return MatchOperand_ParseFail; - } + if (!Str.startswith("attr")) + return Error(S, "invalid interpolation attribute"); StringRef Chan = Str.take_back(2); int AttrChan = StringSwitch<int>(Chan) @@ -7047,57 +6865,49 @@ OperandMatchResultTy AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) { .Case(".z", 2) .Case(".w", 3) .Default(-1); - if (AttrChan == -1) { - Error(S, "invalid or missing interpolation attribute channel"); - return MatchOperand_ParseFail; - } + if (AttrChan == -1) + return Error(S, "invalid or missing interpolation attribute channel"); Str = Str.drop_back(2).drop_front(4); uint8_t Attr; - if (Str.getAsInteger(10, Attr)) { - Error(S, "invalid or missing interpolation attribute number"); - return MatchOperand_ParseFail; - } + if (Str.getAsInteger(10, Attr)) + return Error(S, "invalid or missing interpolation attribute number"); - if (Attr > 63) { - Error(S, "out of bounds interpolation attribute number"); - return MatchOperand_ParseFail; - } + if (Attr > 32) + return Error(S, "out of bounds interpolation attribute number"); SMLoc SChan = SMLoc::getFromPointer(Chan.data()); Operands.push_back(AMDGPUOperand::CreateImm(this, Attr, S, AMDGPUOperand::ImmTyInterpAttr)); - 
Operands.push_back(AMDGPUOperand::CreateImm(this, AttrChan, SChan, - AMDGPUOperand::ImmTyAttrChan)); - return MatchOperand_Success; + Operands.push_back(AMDGPUOperand::CreateImm( + this, AttrChan, SChan, AMDGPUOperand::ImmTyInterpAttrChan)); + return ParseStatus::Success; } //===----------------------------------------------------------------------===// // exp //===----------------------------------------------------------------------===// -OperandMatchResultTy AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) { using namespace llvm::AMDGPU::Exp; StringRef Str; SMLoc S = getLoc(); if (!parseId(Str)) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; unsigned Id = getTgtId(Str); - if (Id == ET_INVALID || !isSupportedTgtId(Id, getSTI())) { - Error(S, (Id == ET_INVALID) ? - "invalid exp target" : - "exp target is not supported on this GPU"); - return MatchOperand_ParseFail; - } + if (Id == ET_INVALID || !isSupportedTgtId(Id, getSTI())) + return Error(S, (Id == ET_INVALID) + ? "invalid exp target" + : "exp target is not supported on this GPU"); Operands.push_back(AMDGPUOperand::CreateImm(this, Id, S, AMDGPUOperand::ImmTyExpTgt)); - return MatchOperand_Success; + return ParseStatus::Success; } //===----------------------------------------------------------------------===// @@ -7562,8 +7372,7 @@ AMDGPUAsmParser::parseSwizzleMacro(int64_t &Imm) { return false; } -OperandMatchResultTy -AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseSwizzle(OperandVector &Operands) { SMLoc S = getLoc(); int64_t Imm = 0; @@ -7580,9 +7389,9 @@ AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) { Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTySwizzle)); - return Ok ? MatchOperand_Success : MatchOperand_ParseFail; + return Ok ? 
ParseStatus::Success : ParseStatus::Failure; } - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; } bool @@ -7638,8 +7447,7 @@ int64_t AMDGPUAsmParser::parseGPRIdxMacro() { return Imm; } -OperandMatchResultTy -AMDGPUAsmParser::parseGPRIdxMode(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseGPRIdxMode(OperandVector &Operands) { using namespace llvm::AMDGPU::VGPRIndexMode; @@ -7649,19 +7457,17 @@ AMDGPUAsmParser::parseGPRIdxMode(OperandVector &Operands) { if (trySkipId("gpr_idx", AsmToken::LParen)) { Imm = parseGPRIdxMacro(); if (Imm == UNDEF) - return MatchOperand_ParseFail; + return ParseStatus::Failure; } else { if (getParser().parseAbsoluteExpression(Imm)) - return MatchOperand_ParseFail; - if (Imm < 0 || !isUInt<4>(Imm)) { - Error(S, "invalid immediate: only 4-bit values are legal"); - return MatchOperand_ParseFail; - } + return ParseStatus::Failure; + if (Imm < 0 || !isUInt<4>(Imm)) + return Error(S, "invalid immediate: only 4-bit values are legal"); } Operands.push_back( AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTyGprIdxMode)); - return MatchOperand_Success; + return ParseStatus::Success; } bool AMDGPUOperand::isGPRIdxMode() const { @@ -7672,17 +7478,16 @@ bool AMDGPUOperand::isGPRIdxMode() const { // sopp branch targets //===----------------------------------------------------------------------===// -OperandMatchResultTy -AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseSOPPBrTarget(OperandVector &Operands) { // Make sure we are not parsing something // that looks like a label or an expression but is not. // This will improve error messages. 
if (isRegister() || isModifier()) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; if (!parseExpr(Operands)) - return MatchOperand_ParseFail; + return ParseStatus::Failure; AMDGPUOperand &Opr = ((AMDGPUOperand &)*Operands[Operands.size() - 1]); assert(Opr.isImm() || Opr.isExpr()); @@ -7696,15 +7501,14 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { Error(Loc, "expected a 16-bit signed jump offset"); } - return MatchOperand_Success; + return ParseStatus::Success; } //===----------------------------------------------------------------------===// // Boolean holding registers //===----------------------------------------------------------------------===// -OperandMatchResultTy -AMDGPUAsmParser::parseBoolReg(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseBoolReg(OperandVector &Operands) { return parseReg(Operands); } @@ -7712,10 +7516,6 @@ AMDGPUAsmParser::parseBoolReg(OperandVector &Operands) { // mubuf //===----------------------------------------------------------------------===// -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultCPol() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyCPol); -} - void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic) { @@ -7775,100 +7575,12 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ); -} - -void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { - OptionalImmIndexMap OptionalIdx; - - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); - - // Add the register arguments - if (Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } - - // Handle the case where soffset is an immediate - if (Op.isImm() 
&& Op.getImmTy() == AMDGPUOperand::ImmTyNone) { - Op.addImmOperands(Inst, 1); - continue; - } - - // Handle tokens like 'offen' which are sometimes hard-coded into the - // asm string. There are no MCInst operands for these. - if (Op.isToken()) { - continue; - } - assert(Op.isImm()); - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; - } - - addOptionalImmOperand(Inst, Operands, OptionalIdx, - AMDGPUOperand::ImmTyOffset); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyFORMAT); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ); } //===----------------------------------------------------------------------===// -// mimg +// SMEM //===----------------------------------------------------------------------===// -void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands, - bool IsAtomic) { - unsigned I = 1; - const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); - for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { - ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); - } - - if (IsAtomic) { - // Add src, same as dst - assert(Desc.getNumDefs() == 1); - ((AMDGPUOperand &)*Operands[I - 1]).addRegOperands(Inst, 1); - } - - OptionalImmIndexMap OptionalIdx; - - for (unsigned E = Operands.size(); I != E; ++I) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - - // Add the register arguments - if (Op.isReg()) { - Op.addRegOperands(Inst, 1); - } else if (Op.isImmModifier()) { - OptionalIdx[Op.getImmTy()] = I; - } else if (!Op.isToken()) { - llvm_unreachable("unexpected operand type"); - } - } - - bool IsGFX10Plus = isGFX10Plus(); - - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask); - if (IsGFX10Plus) - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDim, -1); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm); - 
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16); - if (IsGFX10Plus) - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyA16); - if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::tfe)) - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); - if (!IsGFX10Plus) - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyD16); -} - -void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) { - cvtMIMG(Inst, Operands, true); -} - void AMDGPUAsmParser::cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands) { OptionalImmIndexMap OptionalIdx; bool IsAtomicReturn = false; @@ -7920,54 +7632,28 @@ void AMDGPUAsmParser::cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands) if ((int)Inst.getNumOperands() <= AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::offset)) - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTySMEMOffsetMod); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0); } -void AMDGPUAsmParser::cvtIntersectRay(MCInst &Inst, - const OperandVector &Operands) { - for (unsigned I = 1; I < Operands.size(); ++I) { - auto &Operand = (AMDGPUOperand &)*Operands[I]; - if (Operand.isReg()) - Operand.addRegOperands(Inst, 1); - } - - Inst.addOperand(MCOperand::createImm(1)); // a16 -} - //===----------------------------------------------------------------------===// // smrd //===----------------------------------------------------------------------===// bool AMDGPUOperand::isSMRDOffset8() const { - return isImm() && isUInt<8>(getImm()); + return isImmLiteral() && 
isUInt<8>(getImm()); } bool AMDGPUOperand::isSMEMOffset() const { - return isImmTy(ImmTyNone) || - isImmTy(ImmTyOffset); // Offset range is checked later by validator. + // Offset range is checked later by validator. + return isImmLiteral(); } bool AMDGPUOperand::isSMRDLiteralOffset() const { // 32-bit literals are only supported on CI and we only want to use them // when the offset is > 8-bits. - return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm()); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset8() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMEMOffset() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFlatOffset() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset); + return isImmLiteral() && !isUInt<8>(getImm()) && isUInt<32>(getImm()); } //===----------------------------------------------------------------------===// @@ -7996,12 +7682,13 @@ static bool ConvertOmodDiv(int64_t &Div) { return false; } -// Both bound_ctrl:0 and bound_ctrl:1 are encoded as 1. +// For pre-gfx11 targets, both bound_ctrl:0 and bound_ctrl:1 are encoded as 1. // This is intentional and ensures compatibility with sp3. // See bug 35397 for details. 
-static bool ConvertDppBoundCtrl(int64_t &BoundCtrl) { +bool AMDGPUAsmParser::convertDppBoundCtrl(int64_t &BoundCtrl) { if (BoundCtrl == 0 || BoundCtrl == 1) { - BoundCtrl = 1; + if (!isGFX11Plus()) + BoundCtrl = 1; return true; } return false; @@ -8013,13 +7700,15 @@ void AMDGPUAsmParser::onBeginOfFile() { return; if (!getTargetStreamer().getTargetID()) - getTargetStreamer().initializeTargetID(getSTI(), getSTI().getFeatureString()); + getTargetStreamer().initializeTargetID(getSTI(), getSTI().getFeatureString(), + // TODO: Should try to check code object version from directive??? + AMDGPU::getAmdhsaCodeObjectVersion()); if (isHsaAbiVersion3AndAbove(&getSTI())) getTargetStreamer().EmitDirectiveAMDGCNTarget(); } -OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseOModSI(OperandVector &Operands) { StringRef Name = getTokenStr(); if (Name == "mul") { return parseIntWithPrefix("mul", Operands, @@ -8031,7 +7720,7 @@ OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) AMDGPUOperand::ImmTyOModSI, ConvertOmodDiv); } - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; } // Determines which bit DST_OP_SEL occupies in the op_sel operand according to @@ -8100,9 +7789,8 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands) AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegOrImmWithFPInputModsOperands(Inst, 2); - } else if (Op.isInterpSlot() || - Op.isInterpAttr() || - Op.isAttrChan()) { + } else if (Op.isInterpSlot() || Op.isInterpAttr() || + Op.isInterpAttrChan()) { Inst.addOperand(MCOperand::createImm(Op.getImm())); } else if (Op.isImmModifier()) { OptionalIdx[Op.getImmTy()] = I; @@ -8335,9 +8023,9 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { // VOPD //===----------------------------------------------------------------------===// 
-OperandMatchResultTy AMDGPUAsmParser::parseVOPD(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseVOPD(OperandVector &Operands) { if (!hasVOPD(getSTI())) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; if (isToken(AsmToken::Colon) && peekToken(false).is(AsmToken::Colon)) { SMLoc S = getLoc(); @@ -8348,12 +8036,11 @@ OperandMatchResultTy AMDGPUAsmParser::parseVOPD(OperandVector &Operands) { StringRef OpYName; if (isToken(AsmToken::Identifier) && !Parser.parseIdentifier(OpYName)) { Operands.push_back(AMDGPUOperand::CreateToken(this, OpYName, OpYLoc)); - return MatchOperand_Success; + return ParseStatus::Success; } - Error(OpYLoc, "expected a VOPDY instruction after ::"); - return MatchOperand_ParseFail; + return Error(OpYLoc, "expected a VOPDY instruction after ::"); } - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; } // Create VOPD MCInst operands using parsed assembler operands. @@ -8439,11 +8126,11 @@ bool AMDGPUOperand::isABID() const { } bool AMDGPUOperand::isS16Imm() const { - return isImm() && (isInt<16>(getImm()) || isUInt<16>(getImm())); + return isImmLiteral() && (isInt<16>(getImm()) || isUInt<16>(getImm())); } bool AMDGPUOperand::isU16Imm() const { - return isImm() && isUInt<16>(getImm()); + return isImmLiteral() && isUInt<16>(getImm()); } //===----------------------------------------------------------------------===// @@ -8479,66 +8166,62 @@ bool AMDGPUAsmParser::parseDimId(unsigned &Encoding) { return true; } -OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseDim(OperandVector &Operands) { if (!isGFX10Plus()) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; SMLoc S = getLoc(); if (!trySkipId("dim", AsmToken::Colon)) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; unsigned Encoding; SMLoc Loc = getLoc(); - if (!parseDimId(Encoding)) { - Error(Loc, "invalid dim value"); - return MatchOperand_ParseFail; - } + if 
(!parseDimId(Encoding)) + return Error(Loc, "invalid dim value"); Operands.push_back(AMDGPUOperand::CreateImm(this, Encoding, S, AMDGPUOperand::ImmTyDim)); - return MatchOperand_Success; + return ParseStatus::Success; } //===----------------------------------------------------------------------===// // dpp //===----------------------------------------------------------------------===// -OperandMatchResultTy AMDGPUAsmParser::parseDPP8(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseDPP8(OperandVector &Operands) { SMLoc S = getLoc(); if (!isGFX10Plus() || !trySkipId("dpp8", AsmToken::Colon)) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; // dpp8:[%d,%d,%d,%d,%d,%d,%d,%d] int64_t Sels[8]; if (!skipToken(AsmToken::LBrac, "expected an opening square bracket")) - return MatchOperand_ParseFail; + return ParseStatus::Failure; for (size_t i = 0; i < 8; ++i) { if (i > 0 && !skipToken(AsmToken::Comma, "expected a comma")) - return MatchOperand_ParseFail; + return ParseStatus::Failure; SMLoc Loc = getLoc(); if (getParser().parseAbsoluteExpression(Sels[i])) - return MatchOperand_ParseFail; - if (0 > Sels[i] || 7 < Sels[i]) { - Error(Loc, "expected a 3-bit value"); - return MatchOperand_ParseFail; - } + return ParseStatus::Failure; + if (0 > Sels[i] || 7 < Sels[i]) + return Error(Loc, "expected a 3-bit value"); } if (!skipToken(AsmToken::RBrac, "expected a closing square bracket")) - return MatchOperand_ParseFail; + return ParseStatus::Failure; unsigned DPP8 = 0; for (size_t i = 0; i < 8; ++i) DPP8 |= (Sels[i] << (i * 3)); Operands.push_back(AMDGPUOperand::CreateImm(this, DPP8, S, AMDGPUOperand::ImmTyDPP8)); - return MatchOperand_Success; + return ParseStatus::Success; } bool @@ -8644,13 +8327,12 @@ AMDGPUAsmParser::parseDPPCtrlSel(StringRef Ctrl) { return Val; } -OperandMatchResultTy -AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { using namespace AMDGPU::DPP; if 
(!isToken(AsmToken::Identifier) || !isSupportedDPPCtrl(getTokenStr(), Operands)) - return MatchOperand_NoMatch; + return ParseStatus::NoMatch; SMLoc S = getLoc(); int64_t Val = -1; @@ -8673,31 +8355,11 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { } if (Val == -1) - return MatchOperand_ParseFail; + return ParseStatus::Failure; Operands.push_back( AMDGPUOperand::CreateImm(this, Val, S, AMDGPUOperand::ImmTyDppCtrl)); - return MatchOperand_Success; -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultRowMask() const { - return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultEndpgmImmOperands() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyEndpgm); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBankMask() const { - return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDppBoundCtrl() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFI() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppFi); + return ParseStatus::Success; } void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, @@ -8744,7 +8406,7 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, } AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments - if (IsDPP8 && Op.isFI()) { + if (IsDPP8 && Op.isDppFI()) { Fi = Op.getImm(); } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegOrImmWithFPInputModsOperands(Inst, 2); @@ -8786,7 +8448,7 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::fi)) addOptionalImmOperand(Inst, Operands, OptionalIdx, - AMDGPUOperand::ImmTyDppFi); + 
AMDGPUOperand::ImmTyDppFI); } } @@ -8821,7 +8483,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I Op.addImmOperands(Inst, 1); } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegWithFPInputModsOperands(Inst, 2); - } else if (Op.isFI()) { + } else if (Op.isDppFI()) { Fi = Op.getImm(); } else if (Op.isReg()) { Op.addRegOperands(Inst, 1); @@ -8852,7 +8514,8 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl); if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::fi)) { - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppFi); + addOptionalImmOperand(Inst, Operands, OptionalIdx, + AMDGPUOperand::ImmTyDppFI); } } } @@ -8861,20 +8524,18 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I // sdwa //===----------------------------------------------------------------------===// -OperandMatchResultTy -AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix, - AMDGPUOperand::ImmTy Type) { +ParseStatus AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, + StringRef Prefix, + AMDGPUOperand::ImmTy Type) { using namespace llvm::AMDGPU::SDWA; SMLoc S = getLoc(); StringRef Value; - OperandMatchResultTy res; SMLoc StringLoc; - res = parseStringWithPrefix(Prefix, Value, StringLoc); - if (res != MatchOperand_Success) { - return res; - } + ParseStatus Res = parseStringWithPrefix(Prefix, Value, StringLoc); + if (!Res.isSuccess()) + return Res; int64_t Int; Int = StringSwitch<int64_t>(Value) @@ -8887,28 +8548,23 @@ AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix, .Case("DWORD", SdwaSel::DWORD) .Default(0xffffffff); - if (Int == 0xffffffff) { - Error(StringLoc, "invalid " + Twine(Prefix) + " value"); - return 
MatchOperand_ParseFail; - } + if (Int == 0xffffffff) + return Error(StringLoc, "invalid " + Twine(Prefix) + " value"); Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, Type)); - return MatchOperand_Success; + return ParseStatus::Success; } -OperandMatchResultTy -AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) { using namespace llvm::AMDGPU::SDWA; SMLoc S = getLoc(); StringRef Value; - OperandMatchResultTy res; SMLoc StringLoc; - res = parseStringWithPrefix("dst_unused", Value, StringLoc); - if (res != MatchOperand_Success) { - return res; - } + ParseStatus Res = parseStringWithPrefix("dst_unused", Value, StringLoc); + if (!Res.isSuccess()) + return Res; int64_t Int; Int = StringSwitch<int64_t>(Value) @@ -8917,13 +8573,11 @@ AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) { .Case("UNUSED_PRESERVE", DstUnused::UNUSED_PRESERVE) .Default(0xffffffff); - if (Int == 0xffffffff) { - Error(StringLoc, "invalid dst_unused value"); - return MatchOperand_ParseFail; - } + if (Int == 0xffffffff) + return Error(StringLoc, "invalid dst_unused value"); - Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, AMDGPUOperand::ImmTySdwaDstUnused)); - return MatchOperand_Success; + Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, AMDGPUOperand::ImmTySDWADstUnused)); + return ParseStatus::Success; } void AMDGPUAsmParser::cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands) { @@ -9009,14 +8663,14 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::dst_sel)) addOptionalImmOperand(Inst, Operands, OptionalIdx, - AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); + AMDGPUOperand::ImmTySDWADstSel, SdwaSel::DWORD); if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::dst_unused)) addOptionalImmOperand(Inst, Operands, OptionalIdx, - AMDGPUOperand::ImmTySdwaDstUnused, + AMDGPUOperand::ImmTySDWADstUnused, 
DstUnused::UNUSED_PRESERVE); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySDWASrc0Sel, SdwaSel::DWORD); break; case SIInstrFlags::VOP2: @@ -9025,17 +8679,17 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::omod)) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySDWADstSel, SdwaSel::DWORD); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySDWADstUnused, DstUnused::UNUSED_PRESERVE); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySDWASrc0Sel, SdwaSel::DWORD); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySDWASrc1Sel, SdwaSel::DWORD); break; case SIInstrFlags::VOPC: if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::clamp)) addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySDWASrc0Sel, SdwaSel::DWORD); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySDWASrc1Sel, SdwaSel::DWORD); break; default: @@ -9054,25 
+8708,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } } -//===----------------------------------------------------------------------===// -// mAI -//===----------------------------------------------------------------------===// - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBLGP() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyBLGP); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultCBSZ() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyCBSZ); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultABID() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyABID); -} - /// Force static initialization. extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmParser() { - RegisterMCAsmParser<AMDGPUAsmParser> A(getTheAMDGPUTarget()); + RegisterMCAsmParser<AMDGPUAsmParser> A(getTheR600Target()); RegisterMCAsmParser<AMDGPUAsmParser> B(getTheGCNTarget()); } @@ -9082,8 +8720,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmParser() { #define GET_MNEMONIC_CHECKER #include "AMDGPUGenAsmMatcher.inc" -OperandMatchResultTy -AMDGPUAsmParser::parseCustomOperand(OperandVector &Operands, unsigned MCK) { +ParseStatus AMDGPUAsmParser::parseCustomOperand(OperandVector &Operands, + unsigned MCK) { switch (MCK) { case MCK_addr64: return parseTokenOp("addr64", Operands); @@ -9099,55 +8737,8 @@ AMDGPUAsmParser::parseCustomOperand(OperandVector &Operands, unsigned MCK) { return parseTokenOp("off", Operands); case MCK_row_95_en: return parseTokenOp("row_en", Operands); - case MCK_ImmABID: - return parseIntWithPrefix("abid", Operands, AMDGPUOperand::ImmTyABID); - case MCK_ImmBankMask: - return parseIntWithPrefix("bank_mask", Operands, - AMDGPUOperand::ImmTyDppBankMask); - case MCK_ImmBLGP: { - OperandMatchResultTy Res = - parseIntWithPrefix("blgp", Operands, AMDGPUOperand::ImmTyBLGP); - if (Res == MatchOperand_NoMatch) { - Res = 
parseOperandArrayWithPrefix("neg", Operands, - AMDGPUOperand::ImmTyBLGP); - } - return Res; - } - case MCK_ImmCBSZ: - return parseIntWithPrefix("cbsz", Operands, AMDGPUOperand::ImmTyCBSZ); - case MCK_ImmCPol: - return parseCPol(Operands); - case MCK_ImmFI: - return parseIntWithPrefix("fi", Operands, AMDGPUOperand::ImmTyDppFi); case MCK_gds: return parseNamedBit("gds", Operands, AMDGPUOperand::ImmTyGDS); - case MCK_ImmNegHi: - return parseOperandArrayWithPrefix("neg_hi", Operands, - AMDGPUOperand::ImmTyNegHi); - case MCK_ImmNegLo: - return parseOperandArrayWithPrefix("neg_lo", Operands, - AMDGPUOperand::ImmTyNegLo); - case MCK_ImmSMEMOffset: - return parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset); - case MCK_ImmOModSI: - return parseOModOperand(Operands); - case MCK_ImmOpSel: - return parseOperandArrayWithPrefix("op_sel", Operands, - AMDGPUOperand::ImmTyOpSel); - case MCK_ImmOpSelHi: - return parseOperandArrayWithPrefix("op_sel_hi", Operands, - AMDGPUOperand::ImmTyOpSelHi); - case MCK_ImmRowMask: - return parseIntWithPrefix("row_mask", Operands, - AMDGPUOperand::ImmTyDppRowMask); - case MCK_ImmSDWADstSel: - return parseSDWASel(Operands, "dst_sel", AMDGPUOperand::ImmTySdwaDstSel); - case MCK_ImmSDWADstUnused: - return parseSDWADstUnused(Operands); - case MCK_ImmSDWASrc0Sel: - return parseSDWASel(Operands, "src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel); - case MCK_ImmSDWASrc1Sel: - return parseSDWASel(Operands, "src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel); case MCK_tfe: return parseNamedBit("tfe", Operands, AMDGPUOperand::ImmTyTFE); } @@ -9186,18 +8777,16 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, return Operand.isSSrcB32() ? Match_Success : Match_InvalidOperand; case MCK_SSrcF32: return Operand.isSSrcF32() ? Match_Success : Match_InvalidOperand; - case MCK_SoppBrTarget: - return Operand.isSoppBrTarget() ? Match_Success : Match_InvalidOperand; + case MCK_SOPPBrTarget: + return Operand.isSOPPBrTarget() ? 
Match_Success : Match_InvalidOperand; case MCK_VReg32OrOff: return Operand.isVReg32OrOff() ? Match_Success : Match_InvalidOperand; case MCK_InterpSlot: return Operand.isInterpSlot() ? Match_Success : Match_InvalidOperand; - case MCK_Attr: + case MCK_InterpAttr: return Operand.isInterpAttr() ? Match_Success : Match_InvalidOperand; - case MCK_AttrChan: - return Operand.isAttrChan() ? Match_Success : Match_InvalidOperand; - case MCK_ImmSMEMOffset: - return Operand.isSMEMOffset() ? Match_Success : Match_InvalidOperand; + case MCK_InterpAttrChan: + return Operand.isInterpAttrChan() ? Match_Success : Match_InvalidOperand; case MCK_SReg_64: case MCK_SReg_64_XEXEC: // Null is defined as a 32-bit register but @@ -9215,7 +8804,7 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, // endpgm //===----------------------------------------------------------------------===// -OperandMatchResultTy AMDGPUAsmParser::parseEndpgmOp(OperandVector &Operands) { +ParseStatus AMDGPUAsmParser::parseEndpgm(OperandVector &Operands) { SMLoc S = getLoc(); int64_t Imm = 0; @@ -9224,14 +8813,12 @@ OperandMatchResultTy AMDGPUAsmParser::parseEndpgmOp(OperandVector &Operands) { Imm = 0; } - if (!isUInt<16>(Imm)) { - Error(S, "expected a 16-bit value"); - return MatchOperand_ParseFail; - } + if (!isUInt<16>(Imm)) + return Error(S, "expected a 16-bit value"); Operands.push_back( AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTyEndpgm)); - return MatchOperand_Success; + return ParseStatus::Success; } bool AMDGPUOperand::isEndpgm() const { return isImmTy(ImmTyEndpgm); } @@ -9240,10 +8827,6 @@ bool AMDGPUOperand::isEndpgm() const { return isImmTy(ImmTyEndpgm); } // LDSDIR //===----------------------------------------------------------------------===// -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultWaitVDST() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyWaitVDST); -} - bool AMDGPUOperand::isWaitVDST() const { return 
isImmTy(ImmTyWaitVDST) && isUInt<4>(getImm()); } @@ -9252,10 +8835,6 @@ bool AMDGPUOperand::isWaitVDST() const { // VINTERP //===----------------------------------------------------------------------===// -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultWaitEXP() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyWaitEXP); -} - bool AMDGPUOperand::isWaitEXP() const { return isImmTy(ImmTyWaitEXP) && isUInt<3>(getImm()); } diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index bd7f088c76e3..ea1578e30ae8 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -110,7 +110,6 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, Instruction BaseOpcode = !cast<Instruction>(MTBUFGetBaseOpcode<NAME>.ret); let MTBUF = 1; - let AsmMatchConverter = "cvtMtbuf"; } class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> : @@ -158,7 +157,7 @@ class getMTBUFInsDA<list<RegisterClass> vdataList, RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret; - dag NonVaddrInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, CPol:$cpol, SWZ:$swz); + dag NonVaddrInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, CPol:$cpol, i1imm:$swz); dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs)); dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs)); } @@ -186,7 +185,7 @@ class getMTBUFAsmOps<int addrKind> { !if(!eq(addrKind, BUFAddrKind.Addr64), "$vaddr, $srsrc,$format $soffset addr64", ""))))); - string ret = " $vdata, " # Pfx # "$offset$cpol$swz"; + string ret = " $vdata, " # Pfx # "$offset$cpol"; } class MTBUF_SetupAddr<int addrKind> { @@ -387,7 +386,7 @@ class getMUBUFInsDA<list<RegisterClass> vdataList, RegisterClass vaddrClass = 
!if(!empty(vaddrList), ?, !head(vaddrList)); RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdataClass, isTFE>.ret; - dag NonVaddrInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol, SWZ_0:$swz); + dag NonVaddrInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol, i1imm_0:$swz); dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs)); dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs)); } @@ -421,7 +420,7 @@ class getMUBUFIns<int addrKind, list<RegisterClass> vdataList, bit isTFE> { (ins)))))); } -class getMUBUFAsmOps<int addrKind, bit noVdata = 0, bit isLds = 0, bit isTFE = 0, bit isSwz = 0> { +class getMUBUFAsmOps<int addrKind, bit noVdata = 0, bit isLds = 0, bit isTFE = 0> { string Vdata = !if(noVdata, " ", " $vdata, "); string Lds = !if(isLds, " lds", ""); string TFE = !if(isTFE, " tfe", ""); @@ -434,9 +433,8 @@ class getMUBUFAsmOps<int addrKind, bit noVdata = 0, bit isLds = 0, bit isTFE = 0 ""))))); string Offset = "$offset"; string OtherArgs = "$cpol"; - string Swz = !if(isSwz, "$swz", ""); - string ret = Vdata # MainArgs # Offset # OtherArgs # Lds # TFE # Swz; + string ret = Vdata # MainArgs # Offset # OtherArgs # Lds # TFE; } class MUBUF_SetupAddr<int addrKind> { @@ -467,7 +465,7 @@ class MUBUF_Load_Pseudo <string opName, !if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)), !con(getMUBUFIns<addrKindCopy, [], isTFE>.ret, !if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))), - getMUBUFAsmOps<addrKindCopy, !or(isLds, isLdsOpc), isLds, isTFE, 1>.ret, + getMUBUFAsmOps<addrKindCopy, !or(isLds, isLdsOpc), isLds, isTFE>.ret, pattern>, MUBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # !if(isLds, "_lds", "") # !if(isTFE, "_tfe", "") # @@ -488,15 +486,15 @@ class MUBUF_Load_Pseudo <string opName, } class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat < - (load_vt (ld 
(MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset))), - (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset)) + (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset))), + (load_vt (inst v4i32:$srsrc, i32:$soffset, i32:$offset)) >; class MUBUF_Addr64_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat < - (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))), - (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset)) + (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset))), + (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i32:$offset)) >; multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { @@ -562,7 +560,7 @@ class MUBUF_Store_Pseudo <string opName, : MUBUF_Pseudo<opName, (outs), getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret], isTFE>.ret, - getMUBUFAsmOps<addrKindCopy, 0, 0, isTFE, 1>.ret, + getMUBUFAsmOps<addrKindCopy, 0, 0, isTFE>.ret, pattern>, MUBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # "_" # !if(isTFE, "_tfe", "") # @@ -580,12 +578,12 @@ multiclass MUBUF_Pseudo_Stores_Helper<string opName, ValueType store_vt, def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt, isTFE, [(st legal_store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset))]>, + i32:$offset))]>, MUBUFAddr64Table<0, NAME>; def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, legal_store_vt, isTFE, [(st legal_store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset))]>, + i32:$offset))]>, MUBUFAddr64Table<1, NAME>; def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt, isTFE>; @@ -609,8 +607,8 @@ multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32, class MUBUF_Pseudo_Store_Lds<string opName> : MUBUF_Pseudo<opName, (outs), - (ins SReg_128:$srsrc, SCSrc_b32:$soffset, 
offset:$offset, CPol:$cpol, SWZ:$swz), - " $srsrc, $soffset$offset lds$cpol$swz"> { + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol:$cpol, i1imm:$swz), + " $srsrc, $soffset$offset lds$cpol"> { let LGKM_CNT = 1; let mayLoad = 1; let mayStore = 1; @@ -635,7 +633,7 @@ class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, dag MainInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset); dag CPol = !if(vdata_in, (ins CPol_GLC1:$cpol), (ins CPol_0:$cpol)); - dag ret = !con(Data, !con(MainInputs, CPol)); + dag ret = !con(Data, MainInputs, CPol); } class getMUBUFAtomicIns<int addrKind, @@ -724,23 +722,15 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName, RegisterClass vdataClass, ValueType vdataType, bit isFP = isFloatType<vdataType>.ret> { - let FPAtomic = isFP in - def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>, - MUBUFAddr64Table <0, NAME>; - - let FPAtomic = isFP in - def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>, - MUBUFAddr64Table <1, NAME>; - - let FPAtomic = isFP in - def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - - let FPAtomic = isFP in - - def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - - let FPAtomic = isFP in - def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + let FPAtomic = isFP in { + def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>, + MUBUFAddr64Table <0, NAME>; + def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>, + MUBUFAddr64Table <1, NAME>; + def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + } } multiclass MUBUF_Pseudo_Atomics_RTN <string opName, @@ -748,28 +738,23 @@ multiclass 
MUBUF_Pseudo_Atomics_RTN <string opName, ValueType vdataType, SDPatternOperator atomic, bit isFP = isFloatType<vdataType>.ret> { - let FPAtomic = isFP in - def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, - [(set vdataType:$vdata, - (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <0, NAME # "_RTN">; - - let FPAtomic = isFP in - def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, - [(set vdataType:$vdata, - (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset), - vdataType:$vdata_in))]>, - MUBUFAddr64Table <1, NAME # "_RTN">; - - let FPAtomic = isFP in - def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + let FPAtomic = isFP in { + def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + [(set vdataType:$vdata, + (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), + vdataType:$vdata_in))]>, + MUBUFAddr64Table <0, NAME # "_RTN">; - let FPAtomic = isFP in - def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + [(set vdataType:$vdata, + (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), + vdataType:$vdata_in))]>, + MUBUFAddr64Table <1, NAME # "_RTN">; - let FPAtomic = isFP in - def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + } } multiclass MUBUF_Pseudo_Atomics <string opName, @@ -1124,7 +1109,7 @@ defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN< "buffer_atomic_add_f32", VGPR_32, f32 >; -let SubtargetPredicate = 
HasAtomicPkFaddNoRtnInsts in +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16 >; @@ -1134,7 +1119,7 @@ defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN< "buffer_atomic_add_f32", VGPR_32, f32, null_frag >; -let OtherPredicates = [isGFX90APlus] in +let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < "buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag >; @@ -1233,21 +1218,21 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, def : GCNPat< (vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$auxiliary, 0)), - (!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; def : GCNPat< (vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$auxiliary, 0)), - (!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; def : GCNPat< (vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$auxiliary, timm)), - (!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; @@ -1256,7 +1241,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, timm:$auxiliary, timm)), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, 
SCSrc_b32:$soffset, (as_i16timm $offset), + SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; } @@ -1320,7 +1305,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, def : GCNPat< (st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$auxiliary, 0), - (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; @@ -1328,14 +1313,14 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$auxiliary, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_cpol $auxiliary), (extract_swz $auxiliary)) + timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; def : GCNPat< (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$auxiliary, timm), (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (extract_cpol $auxiliary), (extract_swz $auxiliary)) + timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; def : GCNPat< @@ -1344,7 +1329,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact) getVregSrcForVT<vt>.ret:$vdata, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_cpol $auxiliary), + SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; } @@ -1408,13 +1393,13 
@@ multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isInt let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { def : GCNPat< - (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), vt:$vdata_in)), + (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), vt:$vdata_in)), (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset) >; def : GCNPat< - (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset), + (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), vt:$vdata_in)), (!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset) @@ -1441,7 +1426,7 @@ multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> getVregSrcForVT<data_vt>.ret:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset); def : GCNPat< - (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), data_vt:$vdata_in)), + (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), data_vt:$vdata_in)), !if(!eq(RtnMode, "ret"), (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS OffsetResDag, getVregSrcForVT<data_vt>.ret)), !if(!eq(vt, i32), sub0, sub0_sub1)), @@ -1452,7 +1437,7 @@ multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> getVregSrcForVT<data_vt>.ret:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset); def : GCNPat< - (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset), + (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), data_vt:$vdata_in)), !if(!eq(RtnMode, "ret"), (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS Addr64ResDag, getVregSrcForVT<data_vt>.ret)), @@ -1478,8 +1463,8 @@ defm : BufferAtomicPat<"atomic_load_umax_global", Ty, "BUFFER_ATOMIC_UMAX" # Suf defm : BufferAtomicPat<"atomic_load_and_global", Ty, 
"BUFFER_ATOMIC_AND" # Suffix>; defm : BufferAtomicPat<"atomic_load_or_global", Ty, "BUFFER_ATOMIC_OR" # Suffix>; defm : BufferAtomicPat<"atomic_load_xor_global", Ty, "BUFFER_ATOMIC_XOR" # Suffix>; -defm : BufferAtomicPat<"atomic_inc_global", Ty, "BUFFER_ATOMIC_INC" # Suffix>; -defm : BufferAtomicPat<"atomic_dec_global", Ty, "BUFFER_ATOMIC_DEC" # Suffix>; +defm : BufferAtomicPat<"atomic_load_uinc_wrap_global", Ty, "BUFFER_ATOMIC_INC" # Suffix>; +defm : BufferAtomicPat<"atomic_load_udec_wrap_global", Ty, "BUFFER_ATOMIC_DEC" # Suffix>; } // end foreach Ty @@ -1503,7 +1488,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, timm:$offset, timm:$cachepolicy, 0)), (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), CachePolicy) + timm:$offset, CachePolicy) >; def : GCNPat< @@ -1511,7 +1496,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, timm:$offset, timm:$cachepolicy, timm)), (!cast<MUBUF_Pseudo>(Inst # "_IDXEN" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, - SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy) + SCSrc_b32:$soffset, timm:$offset, CachePolicy) >; def : GCNPat< @@ -1519,7 +1504,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, i32:$soffset, timm:$offset, timm:$cachepolicy, 0)), (!cast<MUBUF_Pseudo>(Inst # "_OFFEN" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, - SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy) + SCSrc_b32:$soffset, timm:$offset, CachePolicy) >; def : GCNPat< @@ -1528,7 +1513,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst, (!cast<MUBUF_Pseudo>(Inst # "_BOTHEN" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy) + 
SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy) >; } // end let AddedComplexity @@ -1584,7 +1569,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), timm:$cachepolicy) + timm:$offset, timm:$cachepolicy) >; def : GCNPat< @@ -1592,7 +1577,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, 0, i32:$soffset, timm:$offset, timm:$cachepolicy, timm), (!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), timm:$cachepolicy) + timm:$offset, timm:$cachepolicy) >; def : GCNPat< @@ -1600,7 +1585,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, i32:$voffset, i32:$soffset, timm:$offset, timm:$cachepolicy, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), timm:$cachepolicy) + timm:$offset, timm:$cachepolicy) >; def : GCNPat< @@ -1610,22 +1595,23 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt, (!cast<MUBUF_Pseudo>(opcode # _BOTHEN) getVregSrcForVT<vt>.ret:$vdata_in, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), timm:$cachepolicy) + SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, timm:$cachepolicy) >; } let SubtargetPredicate = HasAtomicFaddNoRtnInsts in defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>; -let SubtargetPredicate = HasAtomicPkFaddNoRtnInsts in +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>; let 
SubtargetPredicate = HasAtomicFaddRtnInsts in defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>; -let SubtargetPredicate = isGFX90APlus in { - defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>; +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in +defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>; +let SubtargetPredicate = isGFX90APlus in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">; @@ -1641,7 +1627,7 @@ defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy), defvar OffsetResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_OFFSET" # InstSuffix) (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy); + SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy); def : GCNPat< (Op i32:$data, i32:$cmp, v4i32:$rsrc, 0, 0, i32:$soffset, @@ -1653,7 +1639,7 @@ def : GCNPat< defvar IdxenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_IDXEN" # InstSuffix) (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy); def : GCNPat< (Op @@ -1667,7 +1653,7 @@ def : GCNPat< defvar OffenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_OFFEN" # InstSuffix) (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), - VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy); def : GCNPat< (Op @@ -1682,7 +1668,7 @@ def : GCNPat< defvar BothenResDag = 
(!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_BOTHEN" # InstSuffix) (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1), (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy); + SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy); def : GCNPat< (Op i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, @@ -1698,19 +1684,19 @@ def : GCNPat< class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt, PatFrag constant_ld> : GCNPat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset))), + i32:$offset))), (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset) >; multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag atomic_ld> { def : GCNPat < - (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))), + (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset))), (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset) >; def : GCNPat < - (vt (atomic_ld (MUBUFOffset v4i32:$rsrc, i32:$soffset, i16:$offset))), + (vt (atomic_ld (MUBUFOffset v4i32:$rsrc, i32:$soffset, i32:$offset))), (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset)) >; } @@ -1731,7 +1717,7 @@ multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag ld> { def : GCNPat < - (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset))), + (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset))), (Instr_OFFSET $srsrc, $soffset, $offset) >; } @@ -1754,12 +1740,12 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen, ValueType vt, PatFrag ld> { def : GCNPat < (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset))), + i32:$soffset, i32:$offset))), (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0) >; def : GCNPat < - (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), + (vt (ld 
(MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, i32:$offset))), (InstrOffset $srsrc, $soffset, $offset, 0, 0) >; } @@ -1769,12 +1755,12 @@ multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen, MUBUF_Pseudo InstrOffset, ValueType vt, PatFrag ld_frag> { def : GCNPat < - (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in), + (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, i32:$offset), vt:$in), (InstrOffen $vaddr, $srsrc, $soffset, $offset, $in) >; def : GCNPat < - (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in), + (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, i32:$offset), vt:$in), (InstrOffset $srsrc, $soffset, $offset, $in) >; } @@ -1820,12 +1806,12 @@ multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo In ValueType vt, PatFrag atomic_st> { // Store follows atomic op convention so address is first def : GCNPat < - (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset), vt:$val), + (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), vt:$val), (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset) >; def : GCNPat < - (atomic_st (MUBUFOffset v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), + (atomic_st (MUBUFOffset v4i32:$rsrc, i32:$soffset, i32:$offset), vt:$val), (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset)) >; } @@ -1843,7 +1829,7 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag st> { def : GCNPat < - (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset)), + (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset)), (Instr_OFFSET $vdata, $srsrc, $soffset, $offset) >; } @@ -1857,13 +1843,13 @@ multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen, RegisterClass rc = VGPR_32> { def : GCNPat < (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, - i32:$soffset, u16imm:$offset)), + i32:$soffset, 
i32:$offset)), (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0) >; def : GCNPat < (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, - u16imm:$offset)), + i32:$offset)), (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0) >; } @@ -1908,7 +1894,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, def : GCNPat< (vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, 0)), - (!cast<MTBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (!cast<MTBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (as_i8timm $format), (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; @@ -1916,7 +1902,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, def : GCNPat< (vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, timm)), - (!cast<MTBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (!cast<MTBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (as_i8timm $format), (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; @@ -1924,7 +1910,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, def : GCNPat< (vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, 0)), - (!cast<MTBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + (!cast<MTBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (as_i8timm $format), (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; @@ -1934,7 +1920,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, timm:$format, timm:$auxiliary, timm)), (!cast<MTBUF_Pseudo>(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - 
SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), + SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (as_i8timm $format), (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; @@ -1973,7 +1959,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, 0), (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (as_i8timm $format), + timm:$offset, (as_i8timm $format), (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; @@ -1981,7 +1967,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, timm), (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (as_i8timm $format), + timm:$offset, (as_i8timm $format), (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; @@ -1989,7 +1975,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, timm:$format, timm:$auxiliary, 0), (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, - (as_i16timm $offset), (as_i8timm $format), + timm:$offset, (as_i8timm $format), (extract_cpol $auxiliary), (extract_swz $auxiliary)) >; @@ -1999,7 +1985,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact) getVregSrcForVT<vt>.ret:$vdata, (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1), - SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format), + SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (as_i8timm $format), (extract_cpol $auxiliary), 
(extract_swz $auxiliary)) >; } @@ -2710,11 +2696,11 @@ multiclass MUBUF_Real_vi_gfx90a<bits<7> op, MUBUF_Pseudo ps, bit isTFE = 0> { def _vi : MUBUF_Real_vi<op, ps>; if !not(isTFE) then { - foreach _ = BoolToList<!not(ps.FPAtomic)>.ret in + if !not(ps.FPAtomic) then def _gfx90a : MUBUF_Real_gfx90a<op, ps>; } - foreach _ = BoolToList<ps.FPAtomic>.ret in { + if ps.FPAtomic then { def _gfx90a : MUBUF_Real_gfx90a<op, ps, 0> { let SubtargetPredicate = isGFX90AOnly; let AssemblerPredicate = isGFX90AOnly; @@ -2897,11 +2883,11 @@ def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>; def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; } // End AssemblerPredicate = isGFX8GFX9 -let SubtargetPredicate = HasAtomicFaddNoRtnInsts in { -defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>; defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>; +let SubtargetPredicate = HasAtomicFaddNoRtnInsts in { +defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>; } // End SubtargetPredicate = HasAtomicFaddNoRtnInsts let SubtargetPredicate = isGFX90APlus in { diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 26f3537ff095..85a3f763cd5a 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -26,8 +26,6 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt let isPseudo = 1; let isCodeGenOnly = 1; - let AsmMatchConverter = "cvtDS"; - string Mnemonic = opName; string AsmOperands = asmOps; @@ -65,7 +63,6 @@ class DS_Real <DS_Pseudo ps, string opName = ps.Mnemonic> : // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; let OtherPredicates = ps.OtherPredicates; - let AsmMatchConverter = ps.AsmMatchConverter; let SchedRW = ps.SchedRW; let mayLoad = ps.mayLoad; let mayStore = ps.mayStore; @@ -164,7 +161,6 @@ class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32, let has_vdst = 0; let 
has_offset = 0; - let AsmMatchConverter = "cvtDSOffset01"; } multiclass DS_1A2D_Off8_NORET_mc <string opName, RegisterClass rc = VGPR_32> { @@ -187,7 +183,6 @@ class DS_0A1D_RET_GDS<string opName, RegisterClass rc = VGPR_32, RegisterClass s let has_data1 = 0; let has_gds = 0; let gdsValue = 1; - let AsmMatchConverter = "cvtDSGds"; let hasSideEffects = 1; } @@ -220,7 +215,7 @@ multiclass DS_1A1D_RET_mc_gfx9 <string opName, RegisterClass rc = VGPR_32, let has_m0_read = 0 in { def "" : DS_1A1D_RET<opName, rc>, AtomicNoRet<!if(!eq(NoRetOp, ""), "", NoRetOp), - !if(!eq(NoRetOp, ""), 0, 1)>; + !ne(NoRetOp, "")>; } } @@ -262,8 +257,6 @@ class DS_1A2D_Off8_RET<string opName, " $vdst, $addr, $data0, $data1$offset0$offset1$gds"> { let has_offset = 0; - let AsmMatchConverter = "cvtDSOffset01"; - let hasPostISelHook = 1; } @@ -325,7 +318,6 @@ class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32> let has_offset = 0; let has_data0 = 0; let has_data1 = 0; - let AsmMatchConverter = "cvtDSOffset01"; } multiclass DS_1A_Off8_RET_mc <string opName, RegisterClass rc = VGPR_32> { @@ -345,7 +337,6 @@ class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName, let has_data1 = 0; let has_gds = 0; let gdsValue = 1; - let AsmMatchConverter = "cvtDSGds"; } class DS_0A_RET <string opName> : DS_Pseudo<opName, @@ -393,7 +384,6 @@ class DS_GWS <string opName, dag ins, string asmOps> let has_gds = 0; let gdsValue = 1; - let AsmMatchConverter = "cvtDSGds"; } class DS_GWS_0D <string opName> @@ -417,7 +407,6 @@ class DS_VOID <string opName> : DS_Pseudo<opName, let mayStore = 0; let hasSideEffects = 1; let UseNamedOperandTable = 0; - let AsmMatchConverter = ""; let has_vdst = 0; let has_addr = 0; @@ -436,7 +425,7 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag, (ins VGPR_32:$addr, data_op:$data0, offset:$offset), " $vdst, $addr, $data0$offset", [(set i32:$vdst, - (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > { + (node (DS1Addr1Offset i32:$addr, 
i32:$offset), i32:$data0))] > { let mayLoad = 0; let mayStore = 0; @@ -494,12 +483,12 @@ let SubtargetPredicate = isGFX90APlus in { defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VReg_64, "ds_add_f64">; } // End SubtargetPredicate = isGFX90APlus -let SubtargetPredicate = isGFX940Plus in { +let SubtargetPredicate = HasAtomicDsPkAdd16Insts in { defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">; defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">; defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">; defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">; -} // End SubtargetPredicate = isGFX940Plus +} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts defm DS_CMPSTORE_B32 : DS_1A2D_NORET_mc<"ds_cmpstore_b32">; defm DS_CMPSTORE_F32 : DS_1A2D_NORET_mc<"ds_cmpstore_f32">; @@ -631,7 +620,7 @@ def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">; } // End SubtargetPredicate = HasDsSrc2Insts let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in { -def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, SwizzleImm>; +def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, Swizzle>; } let mayStore = 0 in { @@ -740,7 +729,7 @@ def : GCNPat < >; class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat < - (vt (frag (DS1Addr1Offset i32:$ptr, i16:$offset))), + (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), (inst $ptr, offset:$offset, (i1 gds)) >; @@ -756,7 +745,7 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> { } class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat < - (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$in), + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in), (inst $ptr, offset:$offset, (i1 0), $in) >; @@ -800,7 +789,7 @@ def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2f16>; } class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag, int 
gds=0> : GCNPat < - (frag vt:$value, (DS1Addr1Offset i32:$ptr, i16:$offset)), + (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds)) >; @@ -817,7 +806,7 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> { // Irritatingly, atomic_store reverses the order of operands from a // normal store. class DSAtomicWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < - (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value), + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 0)) >; @@ -965,7 +954,7 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">; } // End AddedComplexity = 100 class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0, - bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value), + bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> { let AddedComplexity = complexity; } @@ -1014,7 +1003,7 @@ let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in { // Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode. class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0, bit gds=0> : GCNPat< - (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap), + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds))> { let AddedComplexity = complexity; } @@ -1046,7 +1035,7 @@ let SubtargetPredicate = isGFX11Plus in { // The order of src and cmp agrees with the BUFFER_ATOMIC_CMPSWAP opcode. 
class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0, bit gds=0> : GCNPat< - (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap), + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), (inst $ptr, getVregSrcForVT<vt>.ret:$swap, getVregSrcForVT<vt>.ret:$cmp, offset:$offset, (i1 gds))> { let AddedComplexity = complexity; } @@ -1069,8 +1058,8 @@ multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt, defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap">; defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_U32, DS_ADD_U32, i32, "atomic_load_add">; defm : DSAtomicRetNoRetPat_mc<DS_SUB_RTN_U32, DS_SUB_U32, i32, "atomic_load_sub">; -defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U32, DS_INC_U32, i32, "atomic_inc">; -defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U32, DS_DEC_U32, i32, "atomic_dec">; +defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U32, DS_INC_U32, i32, "atomic_load_uinc_wrap">; +defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U32, DS_DEC_U32, i32, "atomic_load_udec_wrap">; defm : DSAtomicRetNoRetPat_mc<DS_AND_RTN_B32, DS_AND_B32, i32, "atomic_load_and">; defm : DSAtomicRetNoRetPat_mc<DS_OR_RTN_B32, DS_OR_B32, i32, "atomic_load_or">; defm : DSAtomicRetNoRetPat_mc<DS_XOR_RTN_B32, DS_XOR_B32, i32, "atomic_load_xor">; @@ -1097,8 +1086,8 @@ defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_F32, DS_ADD_F32, f32, "atomic_load_fadd defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap">; defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_U64, DS_ADD_U64, i64, "atomic_load_add">; defm : DSAtomicRetNoRetPat_mc<DS_SUB_RTN_U64, DS_SUB_U64, i64, "atomic_load_sub">; -defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U64, DS_INC_U64, i64, "atomic_inc">; -defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U64, DS_DEC_U64, i64, "atomic_dec">; +defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U64, DS_INC_U64, i64, "atomic_load_uinc_wrap">; +defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U64, DS_DEC_U64, i64, "atomic_load_udec_wrap">; defm : 
DSAtomicRetNoRetPat_mc<DS_AND_RTN_B64, DS_AND_B64, i64, "atomic_load_and">; defm : DSAtomicRetNoRetPat_mc<DS_OR_RTN_B64, DS_OR_B64, i64, "atomic_load_or">; defm : DSAtomicRetNoRetPat_mc<DS_XOR_RTN_B64, DS_XOR_B64, i64, "atomic_load_xor">; @@ -1124,7 +1113,7 @@ def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>; class DSAtomicRetPatIntrinsic<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat < - (vt (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value)), + (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value)), (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> { } @@ -1133,7 +1122,7 @@ let AddedComplexity = 1 in def : DSAtomicRetPatIntrinsic<DS_ADD_F64, f64, int_amdgcn_flat_atomic_fadd_noret_local_addrspace>; } -let SubtargetPredicate = isGFX940Plus in { +let SubtargetPredicate = HasAtomicDsPkAdd16Insts in { def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>; let AddedComplexity = 1 in def : DSAtomicRetPat<DS_PK_ADD_F16, v2f16, atomic_load_fadd_v2f16_local_noret_32>; @@ -1146,7 +1135,7 @@ def : GCNPat < (v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)), (DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) >; -} +} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts def : Pat < (SIds_ordered_count i32:$value, i16:$offset), diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index c4e85210848a..1b05acd5c90a 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -45,13 +45,11 @@ using namespace llvm; using DecodeStatus = llvm::MCDisassembler::DecodeStatus; AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI, - MCContext &Ctx, - MCInstrInfo const *MCII) : - MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()), - TargetMaxInstBytes(Ctx.getAsmInfo()->getMaxInstLength(&STI)) { - + 
MCContext &Ctx, MCInstrInfo const *MCII) + : MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()), + MAI(*Ctx.getAsmInfo()), TargetMaxInstBytes(MAI.getMaxInstLength(&STI)) { // ToDo: AMDGPUDisassembler supports only VI ISA. - if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding] && !isGFX10Plus()) + if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus()) report_fatal_error("Disassembly not yet supported for subtarget"); } @@ -74,7 +72,7 @@ static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op, return OpIdx; } -static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, +static DecodeStatus decodeSOPPBrTarget(MCInst &Inst, unsigned Imm, uint64_t Addr, const MCDisassembler *Decoder) { auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); @@ -115,181 +113,158 @@ static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr, return addOperand(Inst, DAsm->DecoderName(Imm)); \ } -#define DECODE_OPERAND_REG(RegClass) \ -DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass) +// Decoder for registers, decode directly using RegClassID. Imm(8-bit) is +// number of register. Used by VGPR only and AGPR only operands. 
+#define DECODE_OPERAND_REG_8(RegClass) \ + static DecodeStatus Decode##RegClass##RegisterClass( \ + MCInst &Inst, unsigned Imm, uint64_t /*Addr*/, \ + const MCDisassembler *Decoder) { \ + assert(Imm < (1 << 8) && "8-bit encoding"); \ + auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); \ + return addOperand( \ + Inst, DAsm->createRegOperand(AMDGPU::RegClass##RegClassID, Imm)); \ + } -DECODE_OPERAND_REG(VGPR_32) -DECODE_OPERAND_REG(VGPR_32_Lo128) -DECODE_OPERAND_REG(VRegOrLds_32) -DECODE_OPERAND_REG(VS_32) -DECODE_OPERAND_REG(VS_64) -DECODE_OPERAND_REG(VS_128) +#define DECODE_SrcOp(Name, EncSize, OpWidth, EncImm, MandatoryLiteral, \ + ImmWidth) \ + static DecodeStatus Name(MCInst &Inst, unsigned Imm, uint64_t /*Addr*/, \ + const MCDisassembler *Decoder) { \ + assert(Imm < (1 << EncSize) && #EncSize "-bit encoding"); \ + auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); \ + return addOperand(Inst, \ + DAsm->decodeSrcOp(AMDGPUDisassembler::OpWidth, EncImm, \ + MandatoryLiteral, ImmWidth)); \ + } -DECODE_OPERAND_REG(VReg_64) -DECODE_OPERAND_REG(VReg_96) -DECODE_OPERAND_REG(VReg_128) -DECODE_OPERAND_REG(VReg_256) -DECODE_OPERAND_REG(VReg_288) -DECODE_OPERAND_REG(VReg_352) -DECODE_OPERAND_REG(VReg_384) -DECODE_OPERAND_REG(VReg_512) -DECODE_OPERAND_REG(VReg_1024) +// Decoder for registers. Imm(7-bit) is number of register, uses decodeSrcOp to +// get register class. Used by SGPR only operands. +#define DECODE_OPERAND_REG_7(RegClass, OpWidth) \ + DECODE_SrcOp(Decode##RegClass##RegisterClass, 7, OpWidth, Imm, false, 0) -DECODE_OPERAND_REG(SReg_32) -DECODE_OPERAND_REG(SReg_32_XM0_XEXEC) -DECODE_OPERAND_REG(SReg_32_XEXEC_HI) -DECODE_OPERAND_REG(SRegOrLds_32) -DECODE_OPERAND_REG(SReg_64) -DECODE_OPERAND_REG(SReg_64_XEXEC) -DECODE_OPERAND_REG(SReg_128) -DECODE_OPERAND_REG(SReg_256) -DECODE_OPERAND_REG(SReg_512) +// Decoder for registers. Imm(10-bit): Imm{7-0} is number of register, +// Imm{9} is acc(agpr or vgpr) Imm{8} should be 0 (see VOP3Pe_SMFMAC). 
+// Set Imm{8} to 1 (IS_VGPR) to decode using 'enum10' from decodeSrcOp. +// Used by AV_ register classes (AGPR or VGPR only register operands). +#define DECODE_OPERAND_REG_AV10(RegClass, OpWidth) \ + DECODE_SrcOp(Decode##RegClass##RegisterClass, 10, OpWidth, \ + Imm | AMDGPU::EncValues::IS_VGPR, false, 0) -DECODE_OPERAND_REG(AGPR_32) -DECODE_OPERAND_REG(AReg_64) -DECODE_OPERAND_REG(AReg_128) -DECODE_OPERAND_REG(AReg_256) -DECODE_OPERAND_REG(AReg_512) -DECODE_OPERAND_REG(AReg_1024) -DECODE_OPERAND_REG(AV_32) -DECODE_OPERAND_REG(AV_64) -DECODE_OPERAND_REG(AV_128) -DECODE_OPERAND_REG(AVDst_128) -DECODE_OPERAND_REG(AVDst_512) +// Decoder for Src(9-bit encoding) registers only. +#define DECODE_OPERAND_SRC_REG_9(RegClass, OpWidth) \ + DECODE_SrcOp(decodeOperand_##RegClass, 9, OpWidth, Imm, false, 0) -static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm)); -} +// Decoder for Src(9-bit encoding) AGPR, register number encoded in 9bits, set +// Imm{9} to 1 (set acc) and decode using 'enum10' from decodeSrcOp, registers +// only. +#define DECODE_OPERAND_SRC_REG_A9(RegClass, OpWidth) \ + DECODE_SrcOp(decodeOperand_##RegClass, 9, OpWidth, Imm | 512, false, 0) -static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); -} +// Decoder for 'enum10' from decodeSrcOp, Imm{0-8} is 9-bit Src encoding +// Imm{9} is acc, registers only. 
+#define DECODE_SRC_OPERAND_REG_AV10(RegClass, OpWidth) \ + DECODE_SrcOp(decodeOperand_##RegClass, 10, OpWidth, Imm, false, 0) -static DecodeStatus decodeOperand_VSrcV232(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeOperand_VSrcV232(Imm)); -} +// Decoder for RegisterOperands using 9-bit Src encoding. Operand can be +// register from RegClass or immediate. Registers that don't belong to RegClass +// will be decoded and InstPrinter will report warning. Immediate will be +// decoded into constant of size ImmWidth, should match width of immediate used +// by OperandType (important for floating point types). +#define DECODE_OPERAND_SRC_REG_OR_IMM_9(RegClass, OpWidth, ImmWidth) \ + DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm, \ + false, ImmWidth) -static DecodeStatus decodeOperand_VS_16(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm)); -} +// Decoder for Src(9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set acc) +// and decode using 'enum10' from decodeSrcOp. 
+#define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth) \ + DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, \ + Imm | 512, false, ImmWidth) -static DecodeStatus decodeOperand_VS_32(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeOperand_VS_32(Imm)); -} +#define DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(RegClass, OpWidth, ImmWidth) \ + DECODE_SrcOp(decodeOperand_##RegClass##_Deferred##_Imm##ImmWidth, 9, \ + OpWidth, Imm, true, ImmWidth) -static DecodeStatus decodeOperand_AReg_64(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm | 512)); -} +// Default decoders generated by tablegen: 'Decode<RegClass>RegisterClass' +// when RegisterClass is used as an operand. Most often used for destination +// operands. 
-static DecodeStatus decodeOperand_AReg_128(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm | 512)); -} +DECODE_OPERAND_REG_8(VGPR_32) +DECODE_OPERAND_REG_8(VGPR_32_Lo128) +DECODE_OPERAND_REG_8(VReg_64) +DECODE_OPERAND_REG_8(VReg_96) +DECODE_OPERAND_REG_8(VReg_128) +DECODE_OPERAND_REG_8(VReg_256) +DECODE_OPERAND_REG_8(VReg_288) +DECODE_OPERAND_REG_8(VReg_352) +DECODE_OPERAND_REG_8(VReg_384) +DECODE_OPERAND_REG_8(VReg_512) +DECODE_OPERAND_REG_8(VReg_1024) -static DecodeStatus decodeOperand_AReg_256(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm | 512)); -} +DECODE_OPERAND_REG_7(SReg_32, OPW32) +DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32) +DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32) +DECODE_OPERAND_REG_7(SReg_64, OPW64) +DECODE_OPERAND_REG_7(SReg_64_XEXEC, OPW64) +DECODE_OPERAND_REG_7(SReg_128, OPW128) +DECODE_OPERAND_REG_7(SReg_256, OPW256) +DECODE_OPERAND_REG_7(SReg_512, OPW512) -static DecodeStatus decodeOperand_AReg_512(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm | 512)); -} +DECODE_OPERAND_REG_8(AGPR_32) +DECODE_OPERAND_REG_8(AReg_64) +DECODE_OPERAND_REG_8(AReg_128) +DECODE_OPERAND_REG_8(AReg_256) +DECODE_OPERAND_REG_8(AReg_512) +DECODE_OPERAND_REG_8(AReg_1024) -static DecodeStatus decodeOperand_AReg_1024(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm | 512)); -} 
+DECODE_OPERAND_REG_AV10(AVDst_128, OPW128) +DECODE_OPERAND_REG_AV10(AVDst_512, OPW512) -static DecodeStatus decodeOperand_VReg_64(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm)); -} +// Decoders for register only source RegisterOperands that use use 9-bit Src +// encoding: 'decodeOperand_<RegClass>'. -static DecodeStatus decodeOperand_VReg_128(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm)); -} +DECODE_OPERAND_SRC_REG_9(VGPR_32, OPW32) +DECODE_OPERAND_SRC_REG_9(VReg_64, OPW64) +DECODE_OPERAND_SRC_REG_9(VReg_128, OPW128) +DECODE_OPERAND_SRC_REG_9(VReg_256, OPW256) +DECODE_OPERAND_SRC_REG_9(VRegOrLds_32, OPW32) -static DecodeStatus decodeOperand_VReg_256(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm)); -} +DECODE_OPERAND_SRC_REG_A9(AGPR_32, OPW32) -static DecodeStatus decodeOperand_VReg_512(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm)); -} +DECODE_SRC_OPERAND_REG_AV10(AV_32, OPW32) +DECODE_SRC_OPERAND_REG_AV10(AV_64, OPW64) +DECODE_SRC_OPERAND_REG_AV10(AV_128, OPW128) -static DecodeStatus decodeOperand_VReg_1024(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm)); -} +// Decoders for register or immediate 
RegisterOperands that use 9-bit Src +// encoding: 'decodeOperand_<RegClass>_Imm<ImmWidth>'. -static DecodeStatus decodeOperand_f32kimm(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); - return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm)); -} +DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_64, OPW64, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_32, OPW32, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(SRegOrLds_32, OPW32, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32_Lo128, OPW16, 16) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 16) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32) -static DecodeStatus decodeOperand_f16kimm(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); - return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm)); -} +DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_512, OPW512, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_1024, OPW1024, 32) -static DecodeStatus -decodeOperand_VS_16_Deferred(MCInst &Inst, unsigned Imm, uint64_t Addr, - const MCDisassembler *Decoder) { - const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); - return addOperand( - Inst, DAsm->decodeSrcOp(llvm::AMDGPUDisassembler::OPW16, Imm, true)); -} +DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32_Lo128, OPW16, 16) 
+DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW16, 16) +DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW32, 32) -static DecodeStatus -decodeOperand_VS_32_Deferred(MCInst &Inst, unsigned Imm, uint64_t Addr, - const MCDisassembler *Decoder) { +static DecodeStatus decodeOperand_KImmFP(MCInst &Inst, unsigned Imm, + uint64_t Addr, + const MCDisassembler *Decoder) { const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); - return addOperand( - Inst, DAsm->decodeSrcOp(llvm::AMDGPUDisassembler::OPW32, Imm, true)); + return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm)); } static DecodeStatus decodeOperandVOPDDstY(MCInst &Inst, unsigned Val, @@ -381,13 +356,6 @@ DecodeAVLdSt_160RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr, Decoder); } -static DecodeStatus decodeOperand_SReg_32(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); - return addOperand(Inst, DAsm->decodeOperand_SReg_32(Imm)); -} - #define DECODE_SDWA(DecName) \ DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName) @@ -436,7 +404,6 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes_, uint64_t Address, raw_ostream &CS) const { - CommentStream = &CS; bool IsSDWA = false; unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size()); @@ -451,13 +418,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // encodings if (isGFX11Plus() && Bytes.size() >= 12 ) { DecoderUInt128 DecW = eat12Bytes(Bytes); - Res = tryDecodeInst(DecoderTableDPP8GFX1196, MI, DecW, - Address); + Res = tryDecodeInst(DecoderTableDPP8GFX1196, MI, DecW, Address, CS); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; MI = MCInst(); // clear - Res = tryDecodeInst(DecoderTableDPPGFX1196, MI, DecW, - Address); + Res = tryDecodeInst(DecoderTableDPPGFX1196, MI, DecW, Address, CS); if (Res) { if 
(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) convertVOP3PDPPInst(MI); @@ -469,7 +434,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } break; } - Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address); + Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address, CS); if (Res) break; } @@ -479,8 +444,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Bytes.size() >= 8) { const uint64_t QW = eatBytes<uint64_t>(Bytes); - if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) { - Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address); + if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) { + Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS); if (Res) { if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8) == -1) @@ -491,37 +456,37 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address); + Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address, CS); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; MI = MCInst(); // clear - Res = tryDecodeInst(DecoderTableDPP8GFX1164, MI, QW, Address); + Res = tryDecodeInst(DecoderTableDPP8GFX1164, MI, QW, Address, CS); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; MI = MCInst(); // clear - Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address); + Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS); if (Res) break; - Res = tryDecodeInst(DecoderTableDPPGFX1164, MI, QW, Address); + Res = tryDecodeInst(DecoderTableDPPGFX1164, MI, QW, Address, CS); if (Res) { if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC) convertVOPCDPPInst(MI); break; } - Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address); + Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address, CS); if (Res) { IsSDWA = true; break; } - Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address); + Res = 
tryDecodeInst(DecoderTableSDWA964, MI, QW, Address, CS); if (Res) { IsSDWA = true; break; } - Res = tryDecodeInst(DecoderTableSDWA1064, MI, QW, Address); + Res = tryDecodeInst(DecoderTableSDWA1064, MI, QW, Address, CS); if (Res) { IsSDWA = true; break; } - if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) { - Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address); + if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem)) { + Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS); if (Res) break; } @@ -529,8 +494,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and // v_mad_mixhi_f16 for FMA variants. Try to decode using this special // table first so we print the correct name. - if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) { - Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address); + if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts)) { + Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS); if (Res) break; } @@ -542,64 +507,64 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // Try decode 32-bit instruction if (Bytes.size() < 4) break; const uint32_t DW = eatBytes<uint32_t>(Bytes); - Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address); + Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS); if (Res) break; - Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address); + Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS); if (Res) break; - Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address); + Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS); if (Res) break; - if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) { - Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address); + if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) { + Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS); if (Res) break; } - if 
(STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) { - Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address); + if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) { + Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS); if (Res) break; } - Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address); + Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS); if (Res) break; - Res = tryDecodeInst(DecoderTableGFX1132, MI, DW, Address); + Res = tryDecodeInst(DecoderTableGFX1132, MI, DW, Address, CS); if (Res) break; if (Bytes.size() < 4) break; const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW; - if (STI.getFeatureBits()[AMDGPU::FeatureGFX940Insts]) { - Res = tryDecodeInst(DecoderTableGFX94064, MI, QW, Address); + if (STI.hasFeature(AMDGPU::FeatureGFX940Insts)) { + Res = tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS); if (Res) break; } - if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) { - Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address); + if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) { + Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS); if (Res) break; } - Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address); + Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS); if (Res) break; - Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address); + Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address, CS); if (Res) break; - Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address); + Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS); if (Res) break; - Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address); + Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS); if (Res) break; - Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address); + Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address, CS); if (Res) break; - Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address); + Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address, CS); } 
while (false); if (Res && AMDGPU::isMAC(MI.getOpcode())) { @@ -627,7 +592,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Res && (MCII->get(MI.getOpcode()).TSFlags & (SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) && - (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts])) { + (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) { // GFX90A lost TFE, its place is occupied by ACC. int TFEOpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe); @@ -714,7 +679,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const { - if (STI.getFeatureBits()[AMDGPU::FeatureGFX11]) { + if (STI.hasFeature(AMDGPU::FeatureGFX11)) { // The MCInst still has these fields even though they are no longer encoded // in the GFX11 instruction. insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm); @@ -736,12 +701,12 @@ DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const { } DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { - if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] || - STI.getFeatureBits()[AMDGPU::FeatureGFX10]) { + if (STI.hasFeature(AMDGPU::FeatureGFX9) || + STI.hasFeature(AMDGPU::FeatureGFX10)) { if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::sdst)) // VOPC - insert clamp insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp); - } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) { + } else if (STI.hasFeature(AMDGPU::FeatureVolcanicIslands)) { int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst); if (SDst != -1) { // VOPC - insert VCC register as sdst @@ -883,6 +848,8 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { AMDGPU::OpName::vdata); int VAddr0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); + int RsrcIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); int 
DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dmask); @@ -898,14 +865,14 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { assert(VDataIdx != -1); if (BaseOpcode->BVH) { // Add A16 operand for intersect_ray instructions - if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::a16)) - addOperand(MI, MCOperand::createImm(1)); + addOperand(MI, MCOperand::createImm(BaseOpcode->A16)); return MCDisassembler::Success; } bool IsAtomic = (VDstIdx != -1); bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4; bool IsNSA = false; + bool IsPartialNSA = false; unsigned AddrSize = Info->VAddrDwords; if (isGFX10Plus()) { @@ -927,9 +894,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { AddrSize = 16; } else { if (AddrSize > Info->VAddrDwords) { - // The NSA encoding does not contain enough operands for the combination - // of base opcode / dimension. Should this be an error? - return MCDisassembler::Success; + if (!STI.hasFeature(AMDGPU::FeaturePartialNSAEncoding)) { + // The NSA encoding does not contain enough operands for the + // combination of base opcode / dimension. Should this be an error? + return MCDisassembler::Success; + } + IsPartialNSA = true; } } } @@ -972,17 +942,20 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { } } - // If not using NSA on GFX10+, widen address register to correct size. - unsigned NewVAddr0 = AMDGPU::NoRegister; - if (isGFX10Plus() && !IsNSA && AddrSize != Info->VAddrDwords) { - unsigned VAddr0 = MI.getOperand(VAddr0Idx).getReg(); - unsigned VAddrSub0 = MRI.getSubReg(VAddr0, AMDGPU::sub0); - VAddr0 = (VAddrSub0 != 0) ? VAddrSub0 : VAddr0; + // If not using NSA on GFX10+, widen vaddr0 address register to correct size. + // If using partial NSA on GFX11+ widen last address register. + int VAddrSAIdx = IsPartialNSA ? 
(RsrcIdx - 1) : VAddr0Idx; + unsigned NewVAddrSA = AMDGPU::NoRegister; + if (STI.hasFeature(AMDGPU::FeatureNSAEncoding) && (!IsNSA || IsPartialNSA) && + AddrSize != Info->VAddrDwords) { + unsigned VAddrSA = MI.getOperand(VAddrSAIdx).getReg(); + unsigned VAddrSubSA = MRI.getSubReg(VAddrSA, AMDGPU::sub0); + VAddrSA = VAddrSubSA ? VAddrSubSA : VAddrSA; - auto AddrRCID = MCII->get(NewOpcode).operands()[VAddr0Idx].RegClass; - NewVAddr0 = MRI.getMatchingSuperReg(VAddr0, AMDGPU::sub0, + auto AddrRCID = MCII->get(NewOpcode).operands()[VAddrSAIdx].RegClass; + NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0, &MRI.getRegClass(AddrRCID)); - if (NewVAddr0 == AMDGPU::NoRegister) + if (!NewVAddrSA) return MCDisassembler::Success; } @@ -997,8 +970,8 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { } } - if (NewVAddr0 != AMDGPU::NoRegister) { - MI.getOperand(VAddr0Idx) = MCOperand::createReg(NewVAddr0); + if (NewVAddrSA) { + MI.getOperand(VAddrSAIdx) = MCOperand::createReg(NewVAddrSA); } else if (IsNSA) { assert(AddrSize <= Info->VAddrDwords); MI.erase(MI.begin() + VAddr0Idx + AddrSize, @@ -1159,214 +1132,6 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID, return createRegOperand(SRegClassID, Val >> shift); } -MCOperand AMDGPUDisassembler::decodeOperand_VS_32(unsigned Val) const { - return decodeSrcOp(OPW32, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const { - return decodeSrcOp(OPW64, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VS_128(unsigned Val) const { - return decodeSrcOp(OPW128, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const { - return decodeSrcOp(OPW16, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VSrcV216(unsigned Val) const { - return decodeSrcOp(OPWV216, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VSrcV232(unsigned Val) const { - return decodeSrcOp(OPWV232, Val); -} - -MCOperand 
AMDGPUDisassembler::decodeOperand_VGPR_32_Lo128(unsigned Val) const { - return createRegOperand(AMDGPU::VGPR_32_Lo128RegClassID, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const { - // Some instructions have operand restrictions beyond what the encoding - // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra - // high bit. - Val &= 255; - - return createRegOperand(AMDGPU::VGPR_32RegClassID, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VRegOrLds_32(unsigned Val) const { - return decodeSrcOp(OPW32, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_AGPR_32(unsigned Val) const { - return createRegOperand(AMDGPU::AGPR_32RegClassID, Val & 255); -} - -MCOperand AMDGPUDisassembler::decodeOperand_AReg_64(unsigned Val) const { - return createRegOperand(AMDGPU::AReg_64RegClassID, Val & 255); -} - -MCOperand AMDGPUDisassembler::decodeOperand_AReg_128(unsigned Val) const { - return createRegOperand(AMDGPU::AReg_128RegClassID, Val & 255); -} - -MCOperand AMDGPUDisassembler::decodeOperand_AReg_256(unsigned Val) const { - return createRegOperand(AMDGPU::AReg_256RegClassID, Val & 255); -} - -MCOperand AMDGPUDisassembler::decodeOperand_AReg_288(unsigned Val) const { - return createRegOperand(AMDGPU::AReg_288RegClassID, Val & 255); -} - -MCOperand AMDGPUDisassembler::decodeOperand_AReg_320(unsigned Val) const { - return createRegOperand(AMDGPU::AReg_320RegClassID, Val & 255); -} - -MCOperand AMDGPUDisassembler::decodeOperand_AReg_352(unsigned Val) const { - return createRegOperand(AMDGPU::AReg_352RegClassID, Val & 255); -} - -MCOperand AMDGPUDisassembler::decodeOperand_AReg_384(unsigned Val) const { - return createRegOperand(AMDGPU::AReg_384RegClassID, Val & 255); -} - - -MCOperand AMDGPUDisassembler::decodeOperand_AReg_512(unsigned Val) const { - return createRegOperand(AMDGPU::AReg_512RegClassID, Val & 255); -} - -MCOperand AMDGPUDisassembler::decodeOperand_AReg_1024(unsigned Val) const { - return 
createRegOperand(AMDGPU::AReg_1024RegClassID, Val & 255); -} - -MCOperand AMDGPUDisassembler::decodeOperand_AV_32(unsigned Val) const { - return decodeSrcOp(OPW32, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_AV_64(unsigned Val) const { - return decodeSrcOp(OPW64, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_AV_128(unsigned Val) const { - return decodeSrcOp(OPW128, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_AVDst_128(unsigned Val) const { - using namespace AMDGPU::EncValues; - assert((Val & IS_VGPR) == 0); // Val{8} is not encoded but assumed to be 1. - return decodeSrcOp(OPW128, Val | IS_VGPR); -} - -MCOperand AMDGPUDisassembler::decodeOperand_AVDst_512(unsigned Val) const { - using namespace AMDGPU::EncValues; - assert((Val & IS_VGPR) == 0); // Val{8} is not encoded but assumed to be 1. - return decodeSrcOp(OPW512, Val | IS_VGPR); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const { - return createRegOperand(AMDGPU::VReg_64RegClassID, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VReg_96(unsigned Val) const { - return createRegOperand(AMDGPU::VReg_96RegClassID, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VReg_128(unsigned Val) const { - return createRegOperand(AMDGPU::VReg_128RegClassID, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VReg_256(unsigned Val) const { - return createRegOperand(AMDGPU::VReg_256RegClassID, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VReg_288(unsigned Val) const { - return createRegOperand(AMDGPU::VReg_288RegClassID, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VReg_320(unsigned Val) const { - return createRegOperand(AMDGPU::VReg_320RegClassID, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VReg_352(unsigned Val) const { - return createRegOperand(AMDGPU::VReg_352RegClassID, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VReg_384(unsigned Val) const { - return createRegOperand(AMDGPU::VReg_384RegClassID, 
Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VReg_512(unsigned Val) const { - return createRegOperand(AMDGPU::VReg_512RegClassID, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_VReg_1024(unsigned Val) const { - return createRegOperand(AMDGPU::VReg_1024RegClassID, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const { - // table-gen generated disassembler doesn't care about operand types - // leaving only registry class so SSrc_32 operand turns into SReg_32 - // and therefore we accept immediates and literals here as well - return decodeSrcOp(OPW32, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0_XEXEC( - unsigned Val) const { - // SReg_32_XM0 is SReg_32 without M0 or EXEC_LO/EXEC_HI - return decodeOperand_SReg_32(Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XEXEC_HI( - unsigned Val) const { - // SReg_32_XM0 is SReg_32 without EXEC_HI - return decodeOperand_SReg_32(Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_SRegOrLds_32(unsigned Val) const { - // table-gen generated disassembler doesn't care about operand types - // leaving only registry class so SSrc_32 operand turns into SReg_32 - // and therefore we accept immediates and literals here as well - return decodeSrcOp(OPW32, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const { - return decodeSrcOp(OPW64, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_SReg_64_XEXEC(unsigned Val) const { - return decodeSrcOp(OPW64, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_SReg_128(unsigned Val) const { - return decodeSrcOp(OPW128, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_SReg_256(unsigned Val) const { - return decodeDstOp(OPW256, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_SReg_288(unsigned Val) const { - return decodeDstOp(OPW288, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_SReg_320(unsigned Val) const { - return decodeDstOp(OPW320, Val); 
-} - -MCOperand AMDGPUDisassembler::decodeOperand_SReg_352(unsigned Val) const { - return decodeDstOp(OPW352, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_SReg_384(unsigned Val) const { - return decodeDstOp(OPW384, Val); -} - -MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const { - return decodeDstOp(OPW512, Val); -} - // Decode Literals for insts which always have a literal in the encoding MCOperand AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const { @@ -1410,21 +1175,21 @@ MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) { static int64_t getInlineImmVal32(unsigned Imm) { switch (Imm) { case 240: - return FloatToBits(0.5f); + return llvm::bit_cast<uint32_t>(0.5f); case 241: - return FloatToBits(-0.5f); + return llvm::bit_cast<uint32_t>(-0.5f); case 242: - return FloatToBits(1.0f); + return llvm::bit_cast<uint32_t>(1.0f); case 243: - return FloatToBits(-1.0f); + return llvm::bit_cast<uint32_t>(-1.0f); case 244: - return FloatToBits(2.0f); + return llvm::bit_cast<uint32_t>(2.0f); case 245: - return FloatToBits(-2.0f); + return llvm::bit_cast<uint32_t>(-2.0f); case 246: - return FloatToBits(4.0f); + return llvm::bit_cast<uint32_t>(4.0f); case 247: - return FloatToBits(-4.0f); + return llvm::bit_cast<uint32_t>(-4.0f); case 248: // 1 / (2 * PI) return 0x3e22f983; default: @@ -1435,21 +1200,21 @@ static int64_t getInlineImmVal32(unsigned Imm) { static int64_t getInlineImmVal64(unsigned Imm) { switch (Imm) { case 240: - return DoubleToBits(0.5); + return llvm::bit_cast<uint64_t>(0.5); case 241: - return DoubleToBits(-0.5); + return llvm::bit_cast<uint64_t>(-0.5); case 242: - return DoubleToBits(1.0); + return llvm::bit_cast<uint64_t>(1.0); case 243: - return DoubleToBits(-1.0); + return llvm::bit_cast<uint64_t>(-1.0); case 244: - return DoubleToBits(2.0); + return llvm::bit_cast<uint64_t>(2.0); case 245: - return DoubleToBits(-2.0); + return llvm::bit_cast<uint64_t>(-2.0); case 246: - return 
DoubleToBits(4.0); + return llvm::bit_cast<uint64_t>(4.0); case 247: - return DoubleToBits(-4.0); + return llvm::bit_cast<uint64_t>(-4.0); case 248: // 1 / (2 * PI) return 0x3fc45f306dc9c882; default: @@ -1482,23 +1247,21 @@ static int64_t getInlineImmVal16(unsigned Imm) { } } -MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) { +MCOperand AMDGPUDisassembler::decodeFPImmed(unsigned ImmWidth, unsigned Imm) { assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN && Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX); // ToDo: case 248: 1/(2*PI) - is allowed only on VI - switch (Width) { - case OPW32: - case OPW128: // splat constants - case OPW512: - case OPW1024: - case OPWV232: + // ImmWidth 0 is a default case where operand should not allow immediates. + // Imm value is still decoded into 32 bit immediate operand, inst printer will + // use it to print verbose error message. + switch (ImmWidth) { + case 0: + case 32: return MCOperand::createImm(getInlineImmVal32(Imm)); - case OPW64: - case OPW256: + case 64: return MCOperand::createImm(getInlineImmVal64(Imm)); - case OPW16: - case OPWV216: + case 16: return MCOperand::createImm(getInlineImmVal16(Imm)); default: llvm_unreachable("implement me"); @@ -1612,7 +1375,8 @@ int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const { } MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val, - bool MandatoryLiteral) const { + bool MandatoryLiteral, + unsigned ImmWidth) const { using namespace AMDGPU::EncValues; assert(Val < 1024); // enum10 @@ -1639,7 +1403,7 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val, return decodeIntImmed(Val); if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX) - return decodeFPImmed(Width, Val); + return decodeFPImmed(ImmWidth, Val); if (Val == LITERAL_CONST) { if (MandatoryLiteral) @@ -1662,26 +1426,6 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val, } } -MCOperand 
AMDGPUDisassembler::decodeDstOp(const OpWidthTy Width, unsigned Val) const { - using namespace AMDGPU::EncValues; - - assert(Val < 128); - assert(Width == OPW256 || Width == OPW512); - - if (Val <= SGPR_MAX) { - // "SGPR_MIN <= Val" is always true and causes compilation warning. - static_assert(SGPR_MIN == 0); - return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN); - } - - int TTmpIdx = getTTmpIdx(Val); - if (TTmpIdx >= 0) { - return createSRegOperand(getTtmpClassId(Width), TTmpIdx); - } - - llvm_unreachable("unknown dst register"); -} - // Bit 0 of DstY isn't stored in the instruction, because it's always the // opposite of bit 0 of DstX. MCOperand AMDGPUDisassembler::decodeVOPDDstYOp(MCInst &Inst, @@ -1764,12 +1508,13 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { } MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, - const unsigned Val) const { + const unsigned Val, + unsigned ImmWidth) const { using namespace AMDGPU::SDWA; using namespace AMDGPU::EncValues; - if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] || - STI.getFeatureBits()[AMDGPU::FeatureGFX10]) { + if (STI.hasFeature(AMDGPU::FeatureGFX9) || + STI.hasFeature(AMDGPU::FeatureGFX10)) { // XXX: cast to int is needed to avoid stupid warning: // compare with unsigned is always true if (int(SDWA9EncValues::SRC_VGPR_MIN) <= int(Val) && @@ -1795,31 +1540,31 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, return decodeIntImmed(SVal); if (INLINE_FLOATING_C_MIN <= SVal && SVal <= INLINE_FLOATING_C_MAX) - return decodeFPImmed(Width, SVal); + return decodeFPImmed(ImmWidth, SVal); return decodeSpecialReg32(SVal); - } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) { + } else if (STI.hasFeature(AMDGPU::FeatureVolcanicIslands)) { return createRegOperand(getVgprClassId(Width), Val); } llvm_unreachable("unsupported target"); } MCOperand AMDGPUDisassembler::decodeSDWASrc16(unsigned Val) const { - return decodeSDWASrc(OPW16, Val); + 
return decodeSDWASrc(OPW16, Val, 16); } MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const { - return decodeSDWASrc(OPW32, Val); + return decodeSDWASrc(OPW32, Val, 32); } MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { using namespace AMDGPU::SDWA; - assert((STI.getFeatureBits()[AMDGPU::FeatureGFX9] || - STI.getFeatureBits()[AMDGPU::FeatureGFX10]) && + assert((STI.hasFeature(AMDGPU::FeatureGFX9) || + STI.hasFeature(AMDGPU::FeatureGFX10)) && "SDWAVopcDst should be present only on GFX9+"); - bool IsWave64 = STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64]; + bool IsWave64 = STI.hasFeature(AMDGPU::FeatureWavefrontSize64); if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) { Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK; @@ -1840,18 +1585,19 @@ MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { } MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const { - return STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ? - decodeOperand_SReg_64(Val) : decodeOperand_SReg_32(Val); + return STI.hasFeature(AMDGPU::FeatureWavefrontSize64) + ? 
decodeSrcOp(OPW64, Val) + : decodeSrcOp(OPW32, Val); } bool AMDGPUDisassembler::isVI() const { - return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; + return STI.hasFeature(AMDGPU::FeatureVolcanicIslands); } bool AMDGPUDisassembler::isGFX9() const { return AMDGPU::isGFX9(STI); } bool AMDGPUDisassembler::isGFX90A() const { - return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]; + return STI.hasFeature(AMDGPU::FeatureGFX90AInsts); } bool AMDGPUDisassembler::isGFX9Plus() const { return AMDGPU::isGFX9Plus(STI); } @@ -1863,7 +1609,7 @@ bool AMDGPUDisassembler::isGFX10Plus() const { } bool AMDGPUDisassembler::isGFX11() const { - return STI.getFeatureBits()[AMDGPU::FeatureGFX11]; + return STI.hasFeature(AMDGPU::FeatureGFX11); } bool AMDGPUDisassembler::isGFX11Plus() const { @@ -1872,16 +1618,21 @@ bool AMDGPUDisassembler::isGFX11Plus() const { bool AMDGPUDisassembler::hasArchitectedFlatScratch() const { - return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch]; + return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch); } //===----------------------------------------------------------------------===// // AMDGPU specific symbol handling //===----------------------------------------------------------------------===// +#define GET_FIELD(MASK) (AMDHSA_BITS_GET(FourByteBuffer, MASK)) #define PRINT_DIRECTIVE(DIRECTIVE, MASK) \ do { \ - KdStream << Indent << DIRECTIVE " " \ - << ((FourByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n'; \ + KdStream << Indent << DIRECTIVE " " << GET_FIELD(MASK) << '\n'; \ + } while (0) +#define PRINT_PSEUDO_DIRECTIVE_COMMENT(DIRECTIVE, MASK) \ + do { \ + KdStream << Indent << MAI.getCommentString() << ' ' << DIRECTIVE " " \ + << GET_FIELD(MASK) << '\n'; \ } while (0) // NOLINTNEXTLINE(readability-identifier-naming) @@ -1896,11 +1647,11 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1( // simply calculate the inverse of what the assembler does. 
uint32_t GranulatedWorkitemVGPRCount = - (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT) >> - COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT; + GET_FIELD(COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT); - uint32_t NextFreeVGPR = (GranulatedWorkitemVGPRCount + 1) * - AMDGPU::IsaInfo::getVGPREncodingGranule(&STI); + uint32_t NextFreeVGPR = + (GranulatedWorkitemVGPRCount + 1) * + AMDGPU::IsaInfo::getVGPREncodingGranule(&STI, EnableWavefrontSize32); KdStream << Indent << ".amdhsa_next_free_vgpr " << NextFreeVGPR << '\n'; @@ -1924,8 +1675,7 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1( // The disassembler cannot recover the original values of those 3 directives. uint32_t GranulatedWavefrontSGPRCount = - (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT) >> - COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT; + GET_FIELD(COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT); if (isGFX10Plus() && GranulatedWavefrontSGPRCount) return MCDisassembler::Fail; @@ -2035,7 +1785,46 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2( return MCDisassembler::Success; } +// NOLINTNEXTLINE(readability-identifier-naming) +MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3( + uint32_t FourByteBuffer, raw_string_ostream &KdStream) const { + using namespace amdhsa; + StringRef Indent = "\t"; + if (isGFX90A()) { + KdStream << Indent << ".amdhsa_accum_offset " + << (GET_FIELD(COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET) + 1) * 4 + << '\n'; + if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX90A_RESERVED0) + return MCDisassembler::Fail; + PRINT_DIRECTIVE(".amdhsa_tg_split", COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT); + if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX90A_RESERVED1) + return MCDisassembler::Fail; + } else if (isGFX10Plus()) { + if (!EnableWavefrontSize32 || !*EnableWavefrontSize32) { + PRINT_DIRECTIVE(".amdhsa_shared_vgpr_count", + 
COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT); + } else { + PRINT_PSEUDO_DIRECTIVE_COMMENT( + "SHARED_VGPR_COUNT", COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT); + } + PRINT_PSEUDO_DIRECTIVE_COMMENT("INST_PREF_SIZE", + COMPUTE_PGM_RSRC3_GFX10_PLUS_INST_PREF_SIZE); + PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_START", + COMPUTE_PGM_RSRC3_GFX10_PLUS_TRAP_ON_START); + PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_END", + COMPUTE_PGM_RSRC3_GFX10_PLUS_TRAP_ON_END); + if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED0) + return MCDisassembler::Fail; + PRINT_PSEUDO_DIRECTIVE_COMMENT("IMAGE_OP", + COMPUTE_PGM_RSRC3_GFX10_PLUS_TRAP_ON_START); + } else if (FourByteBuffer) { + return MCDisassembler::Fail; + } + return MCDisassembler::Success; +} +#undef PRINT_PSEUDO_DIRECTIVE_COMMENT #undef PRINT_DIRECTIVE +#undef GET_FIELD MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeKernelDescriptorDirective( @@ -2103,30 +1892,16 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective( return MCDisassembler::Success; case amdhsa::COMPUTE_PGM_RSRC3_OFFSET: - // COMPUTE_PGM_RSRC3 - // - Only set for GFX10, GFX6-9 have this to be 0. - // - Currently no directives directly control this. 
FourByteBuffer = DE.getU32(Cursor); - if (!isGFX10Plus() && FourByteBuffer) { - return MCDisassembler::Fail; - } - return MCDisassembler::Success; + return decodeCOMPUTE_PGM_RSRC3(FourByteBuffer, KdStream); case amdhsa::COMPUTE_PGM_RSRC1_OFFSET: FourByteBuffer = DE.getU32(Cursor); - if (decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream) == - MCDisassembler::Fail) { - return MCDisassembler::Fail; - } - return MCDisassembler::Success; + return decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream); case amdhsa::COMPUTE_PGM_RSRC2_OFFSET: FourByteBuffer = DE.getU32(Cursor); - if (decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream) == - MCDisassembler::Fail) { - return MCDisassembler::Fail; - } - return MCDisassembler::Success; + return decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream); case amdhsa::KERNEL_CODE_PROPERTIES_OFFSET: using namespace amdhsa; @@ -2161,7 +1936,7 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective( KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); } - if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) + if (AMDGPU::getAmdhsaCodeObjectVersion() >= AMDGPU::AMDHSA_COV5) PRINT_DIRECTIVE(".amdhsa_uses_dynamic_stack", KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK); @@ -2192,6 +1967,20 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeKernelDescriptor( if (Bytes.size() != 64 || KdAddress % 64 != 0) return MCDisassembler::Fail; + // FIXME: We can't actually decode "in order" as is done below, as e.g. GFX10 + // requires us to know the setting of .amdhsa_wavefront_size32 in order to + // accurately produce .amdhsa_next_free_vgpr, and they appear in the wrong + // order. Workaround this by first looking up .amdhsa_wavefront_size32 here + // when required. 
+ if (isGFX10Plus()) { + uint16_t KernelCodeProperties = + support::endian::read16(&Bytes[amdhsa::KERNEL_CODE_PROPERTIES_OFFSET], + support::endianness::little); + EnableWavefrontSize32 = + AMDHSA_BITS_GET(KernelCodeProperties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); + } + std::string Kd; raw_string_ostream KdStream(Kd); KdStream << ".amdhsa_kernel " << KdName << '\n'; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 870f7b17df20..444312473a5f 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -16,14 +16,16 @@ #define LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H #include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallString.h" #include "llvm/MC/MCDisassembler/MCDisassembler.h" -#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/Support/DataExtractor.h" #include <memory> namespace llvm { +class MCAsmInfo; class MCInst; class MCOperand; class MCSubtargetInfo; @@ -91,10 +93,12 @@ class AMDGPUDisassembler : public MCDisassembler { private: std::unique_ptr<MCInstrInfo const> const MCII; const MCRegisterInfo &MRI; + const MCAsmInfo &MAI; const unsigned TargetMaxInstBytes; mutable ArrayRef<uint8_t> Bytes; mutable uint32_t Literal; mutable bool HasLiteral; + mutable std::optional<bool> EnableWavefrontSize32; public: AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, @@ -115,14 +119,25 @@ public: template <typename InsnType> DecodeStatus tryDecodeInst(const uint8_t *Table, MCInst &MI, InsnType Inst, - uint64_t Address) const { + uint64_t Address, raw_ostream &Comments) const { assert(MI.getOpcode() == 0); assert(MI.getNumOperands() == 0); MCInst TmpInst; HasLiteral = false; const auto SavedBytes = Bytes; - if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) { + + SmallString<64> LocalComments; + 
raw_svector_ostream LocalCommentStream(LocalComments); + CommentStream = &LocalCommentStream; + + DecodeStatus Res = + decodeInstruction(Table, TmpInst, Inst, Address, this, STI); + + CommentStream = nullptr; + + if (Res != Fail) { MI = TmpInst; + Comments << LocalComments; return MCDisassembler::Success; } Bytes = SavedBytes; @@ -155,6 +170,13 @@ public: DecodeStatus decodeCOMPUTE_PGM_RSRC2(uint32_t FourByteBuffer, raw_string_ostream &KdStream) const; + /// Decode as directives that handle COMPUTE_PGM_RSRC3. + /// \param FourByteBuffer - Bytes holding contents of COMPUTE_PGM_RSRC3. + /// \param KdStream - Stream to write the disassembled directives to. + // NOLINTNEXTLINE(readability-identifier-naming) + DecodeStatus decodeCOMPUTE_PGM_RSRC3(uint32_t FourByteBuffer, + raw_string_ostream &KdStream) const; + DecodeStatus convertEXPInst(MCInst &MI) const; DecodeStatus convertVINTERPInst(MCInst &MI) const; DecodeStatus convertFMAanyK(MCInst &MI, int ImmLitIdx) const; @@ -166,58 +188,6 @@ public: DecodeStatus convertVOPCDPPInst(MCInst &MI) const; void convertMacDPPInst(MCInst &MI) const; - MCOperand decodeOperand_VGPR_32(unsigned Val) const; - MCOperand decodeOperand_VGPR_32_Lo128(unsigned Val) const; - MCOperand decodeOperand_VRegOrLds_32(unsigned Val) const; - - MCOperand decodeOperand_VS_32(unsigned Val) const; - MCOperand decodeOperand_VS_64(unsigned Val) const; - MCOperand decodeOperand_VS_128(unsigned Val) const; - MCOperand decodeOperand_VSrc16(unsigned Val) const; - MCOperand decodeOperand_VSrcV216(unsigned Val) const; - MCOperand decodeOperand_VSrcV232(unsigned Val) const; - - MCOperand decodeOperand_VReg_64(unsigned Val) const; - MCOperand decodeOperand_VReg_96(unsigned Val) const; - MCOperand decodeOperand_VReg_128(unsigned Val) const; - MCOperand decodeOperand_VReg_256(unsigned Val) const; - MCOperand decodeOperand_VReg_288(unsigned Val) const; - MCOperand decodeOperand_VReg_320(unsigned Val) const; - MCOperand decodeOperand_VReg_352(unsigned Val) const; - 
MCOperand decodeOperand_VReg_384(unsigned Val) const; - MCOperand decodeOperand_VReg_512(unsigned Val) const; - MCOperand decodeOperand_VReg_1024(unsigned Val) const; - - MCOperand decodeOperand_SReg_32(unsigned Val) const; - MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const; - MCOperand decodeOperand_SReg_32_XEXEC_HI(unsigned Val) const; - MCOperand decodeOperand_SRegOrLds_32(unsigned Val) const; - MCOperand decodeOperand_SReg_64(unsigned Val) const; - MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const; - MCOperand decodeOperand_SReg_128(unsigned Val) const; - MCOperand decodeOperand_SReg_256(unsigned Val) const; - MCOperand decodeOperand_SReg_288(unsigned Val) const; - MCOperand decodeOperand_SReg_320(unsigned Val) const; - MCOperand decodeOperand_SReg_352(unsigned Val) const; - MCOperand decodeOperand_SReg_384(unsigned Val) const; - MCOperand decodeOperand_SReg_512(unsigned Val) const; - - MCOperand decodeOperand_AGPR_32(unsigned Val) const; - MCOperand decodeOperand_AReg_64(unsigned Val) const; - MCOperand decodeOperand_AReg_128(unsigned Val) const; - MCOperand decodeOperand_AReg_256(unsigned Val) const; - MCOperand decodeOperand_AReg_288(unsigned Val) const; - MCOperand decodeOperand_AReg_320(unsigned Val) const; - MCOperand decodeOperand_AReg_352(unsigned Val) const; - MCOperand decodeOperand_AReg_384(unsigned Val) const; - MCOperand decodeOperand_AReg_512(unsigned Val) const; - MCOperand decodeOperand_AReg_1024(unsigned Val) const; - MCOperand decodeOperand_AV_32(unsigned Val) const; - MCOperand decodeOperand_AV_64(unsigned Val) const; - MCOperand decodeOperand_AV_128(unsigned Val) const; - MCOperand decodeOperand_AVDst_128(unsigned Val) const; - MCOperand decodeOperand_AVDst_512(unsigned Val) const; - enum OpWidthTy { OPW32, OPW64, @@ -244,18 +214,21 @@ public: unsigned getTtmpClassId(const OpWidthTy Width) const; static MCOperand decodeIntImmed(unsigned Imm); - static MCOperand decodeFPImmed(OpWidthTy Width, unsigned Imm); + static 
MCOperand decodeFPImmed(unsigned ImmWidth, unsigned Imm); + MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const; MCOperand decodeLiteralConstant() const; MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val, - bool MandatoryLiteral = false) const; - MCOperand decodeDstOp(const OpWidthTy Width, unsigned Val) const; + bool MandatoryLiteral = false, + unsigned ImmWidth = 0) const; + MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const; MCOperand decodeSpecialReg32(unsigned Val) const; MCOperand decodeSpecialReg64(unsigned Val) const; - MCOperand decodeSDWASrc(const OpWidthTy Width, unsigned Val) const; + MCOperand decodeSDWASrc(const OpWidthTy Width, unsigned Val, + unsigned ImmWidth = 0) const; MCOperand decodeSDWASrc16(unsigned Val) const; MCOperand decodeSDWASrc32(unsigned Val) const; MCOperand decodeSDWAVopcDst(unsigned Val) const; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 09f59af06589..5c86d80e7dd2 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -466,7 +466,7 @@ class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins, let PseudoInstr = NAME # "_RTN"; } -multiclass FLAT_Atomic_Pseudo< +multiclass FLAT_Atomic_Pseudo_NO_RTN< string opName, RegisterClass vdst_rc, ValueType vt, @@ -484,7 +484,16 @@ multiclass FLAT_Atomic_Pseudo< let FPAtomic = isFP; let AddedComplexity = -1; // Prefer global atomics if available } +} +multiclass FLAT_Atomic_Pseudo_RTN< + string opName, + RegisterClass vdst_rc, + ValueType vt, + ValueType data_vt = vt, + RegisterClass data_rc = vdst_rc, + bit isFP = isFloatType<data_vt>.ret, + RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> { def _RTN : FLAT_AtomicRet_Pseudo <opName, (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst), (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), @@ -496,6 +505,18 @@ multiclass FLAT_Atomic_Pseudo< } } +multiclass 
FLAT_Atomic_Pseudo< + string opName, + RegisterClass vdst_rc, + ValueType vt, + ValueType data_vt = vt, + RegisterClass data_rc = vdst_rc, + bit isFP = isFloatType<data_vt>.ret, + RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> { + defm "" : FLAT_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc, isFP, data_op>; + defm "" : FLAT_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc, isFP, data_op>; +} + multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< string opName, RegisterClass vdst_rc, @@ -709,11 +730,14 @@ let SubtargetPredicate = isGFX90APlus in { defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>; } // End SubtargetPredicate = isGFX90APlus -let SubtargetPredicate = isGFX940Plus in { +let SubtargetPredicate = HasAtomicFlatPkAdd16Insts in { defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16>; - defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2f16>; - defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2f16>; -} // End SubtargetPredicate = isGFX940Plus + let FPAtomic = 1 in + defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2i16>; +} // End SubtargetPredicate = HasAtomicFlatPkAdd16Insts + +let SubtargetPredicate = HasAtomicGlobalPkAddBF16Inst, FPAtomic = 1 in + defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2i16>; // GFX7-, GFX10-, GFX11-only flat instructions. 
let SubtargetPredicate = isGFX7GFX10GFX11 in { @@ -917,7 +941,7 @@ let OtherPredicates = [HasAtomicFaddNoRtnInsts] in defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < "global_atomic_add_f32", VGPR_32, f32 >; -let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in +let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < "global_atomic_pk_add_f16", VGPR_32, v2f16 >; @@ -925,7 +949,7 @@ let OtherPredicates = [HasAtomicFaddRtnInsts] in defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN < "global_atomic_add_f32", VGPR_32, f32 >; -let OtherPredicates = [isGFX90APlus] in +let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN < "global_atomic_pk_add_f16", VGPR_32, v2f16 >; @@ -937,73 +961,73 @@ let OtherPredicates = [isGFX90APlus] in // Patterns for global loads with no offset. class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (FlatOffset i64:$vaddr, i16:$offset))), + (vt (node (FlatOffset i64:$vaddr, i32:$offset))), (inst $vaddr, $offset) >; class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node (FlatOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in), + (node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in), (inst $vaddr, $offset, 0, $in) >; class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node (GlobalOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in), + (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in), (inst $vaddr, $offset, 0, $in) >; class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$in)), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)), (inst $saddr, $voffset, 
$offset, 0, $in) >; class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i16:$offset))), + (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))), (inst $vaddr, $offset) >; class GlobalLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset))), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), (inst $saddr, $voffset, $offset, 0) >; class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset)), + (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)), (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset) >; class GlobalAtomicStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$data), + (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$data), (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset) >; class GlobalAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), data_vt:$data)), + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), data_vt:$data)), (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset) >; class GlobalAtomicNoRtnSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$data), + (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$data), (inst $voffset, getVregSrcForVT<vt>.ret:$data, 
$saddr, $offset) >; class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (FlatOffset i64:$vaddr, i16:$offset)), + (node vt:$data, (FlatOffset i64:$vaddr, i32:$offset)), (inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset) >; class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (GlobalOffset i64:$vaddr, i16:$offset)), + (node vt:$data, (GlobalOffset i64:$vaddr, i32:$offset)), (inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset) >; class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < // atomic store follows atomic binop convention so the address comes // first. - (node (FlatOffset i64:$vaddr, i16:$offset), vt:$data), + (node (FlatOffset i64:$vaddr, i32:$offset), vt:$data), (inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset) >; @@ -1011,7 +1035,7 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < // atomic store follows atomic binop convention so the address comes // first. 
- (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data), + (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data), (inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset) >; @@ -1020,17 +1044,17 @@ multiclass FlatAtomicPat <string inst, string node, ValueType vt, defvar rtnNode = !cast<PatFrags>(node#"_"#vt.Size); defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size); - def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)), + def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; let AddedComplexity = 1 in - def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)), + def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>; } class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, ValueType data_vt = vt> : GCNPat < - (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)), + (vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)), (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset) >; @@ -1063,49 +1087,49 @@ multiclass FlatSignedAtomicPatWithAddrSpace<string inst, string intr, string add } class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset))), + (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))), (inst $vaddr, $offset) >; class ScratchLoadSignedPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset), vt:$in), + (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset), vt:$in), (inst $vaddr, $offset, 0, $in) >; class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < 
- (node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset)), + (node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset)), (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $offset) >; class ScratchLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset))), + (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset))), (inst $saddr, $offset) >; class ScratchLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset), vt:$in)), + (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset), vt:$in)), (inst $saddr, $offset, 0, $in) >; class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset)), + (node vt:$data, (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset)), (inst getVregSrcForVT<vt>.ret:$data, $saddr, $offset) >; class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i16:$offset))), + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))), (inst $vaddr, $saddr, $offset, 0) >; class ScratchStoreSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i16:$offset)), + (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset)), (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset) >; class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < - (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i16:$offset), vt:$in)), + (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset), vt:$in)), (inst $vaddr, $saddr, $offset, 0, $in) >; @@ 
-1160,8 +1184,8 @@ def : FlatStoreAtomicPat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>; foreach as = [ "flat", "global" ] in { defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>; defm : FlatAtomicPat <"FLAT_ATOMIC_SUB", "atomic_load_sub_"#as, i32>; -defm : FlatAtomicPat <"FLAT_ATOMIC_INC", "atomic_inc_"#as, i32>; -defm : FlatAtomicPat <"FLAT_ATOMIC_DEC", "atomic_dec_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_INC", "atomic_load_uinc_wrap_"#as, i32>; +defm : FlatAtomicPat <"FLAT_ATOMIC_DEC", "atomic_load_udec_wrap_"#as, i32>; defm : FlatAtomicPat <"FLAT_ATOMIC_AND", "atomic_load_and_"#as, i32>; defm : FlatAtomicPat <"FLAT_ATOMIC_SMAX", "atomic_load_max_"#as, i32>; defm : FlatAtomicPat <"FLAT_ATOMIC_UMAX", "atomic_load_umax_"#as, i32>; @@ -1174,8 +1198,8 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_XOR", "atomic_load_xor_"#as, i32>; defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_X2", "atomic_load_add_"#as, i64>; defm : FlatAtomicPat <"FLAT_ATOMIC_SUB_X2", "atomic_load_sub_"#as, i64>; -defm : FlatAtomicPat <"FLAT_ATOMIC_INC_X2", "atomic_inc_"#as, i64>; -defm : FlatAtomicPat <"FLAT_ATOMIC_DEC_X2", "atomic_dec_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_INC_X2", "atomic_load_uinc_wrap_"#as, i64>; +defm : FlatAtomicPat <"FLAT_ATOMIC_DEC_X2", "atomic_load_udec_wrap_"#as, i64>; defm : FlatAtomicPat <"FLAT_ATOMIC_AND_X2", "atomic_load_and_"#as, i64>; defm : FlatAtomicPat <"FLAT_ATOMIC_SMAX_X2", "atomic_load_max_"#as, i64>; defm : FlatAtomicPat <"FLAT_ATOMIC_UMAX_X2", "atomic_load_umax_"#as, i64>; @@ -1429,8 +1453,8 @@ defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC", "atomic_inc_global", i32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC", "atomic_dec_global", i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC", 
"atomic_load_uinc_wrap_global", i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC", "atomic_load_udec_wrap_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND", "atomic_load_and_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX", "atomic_load_max_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX", "atomic_load_umax_global", i32>; @@ -1444,8 +1468,8 @@ defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", "atomic_inc_global", i64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC_X2", "atomic_dec_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", "atomic_load_uinc_wrap_global", i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC_X2", "atomic_load_udec_wrap_global", i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND_X2", "atomic_load_and_global", i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX_X2", "atomic_load_max_global", i64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX_X2", "atomic_load_umax_global", i64>; @@ -1459,12 +1483,23 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i let OtherPredicates = [isGFX10Plus] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>; -defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN", 
"atomic_load_fmin_flat", f32>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>; +} + +let OtherPredicates = [isGFX10Only] in { +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", "int_amdgcn_global_atomic_fmin", f64>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN_X2", "atomic_load_fmin_flat", f64>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX_X2", "atomic_load_fmax_flat", f64>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN_X2", "int_amdgcn_flat_atomic_fmin", f64>; +defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX_X2", "int_amdgcn_flat_atomic_fmax", f64>; } let OtherPredicates = [HasAtomicFaddNoRtnInsts] in { @@ -1473,7 +1508,7 @@ defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amd defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>; } -let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in { +let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in { defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; } @@ -1484,14 +1519,17 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgc defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>; } +let 
OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in { +defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; +defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; +} + let OtherPredicates = [isGFX90APlus] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>; defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>; defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>; -defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; -defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>; defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>; defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; @@ -1507,12 +1545,14 @@ defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32> defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", f32>; } -let OtherPredicates = [isGFX940Plus] in { +let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in { defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>; defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; -defm : 
GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; } +let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; + } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in { @@ -2171,12 +2211,16 @@ class FLAT_Real_gfx11 <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> let Inst{55} = ps.sve; } -multiclass FLAT_Real_Base_gfx11<bits<7> op, string ps, string opName, int renamed = false> { +multiclass FLAT_Aliases_gfx11<string ps, string opName, int renamed> { + if renamed then + def _renamed_gfx11 : MnemonicAlias<!cast<FLAT_Pseudo>(ps).Mnemonic, opName>, Requires<[isGFX11Plus]>; +} + +multiclass FLAT_Real_Base_gfx11<bits<7> op, string ps, string opName, int renamed = false> : + FLAT_Aliases_gfx11<ps, opName, renamed> { def _gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps), opName> { let Inst{54-48} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding); } - if renamed then - def _renamed_gfx11 : MnemonicAlias<!cast<FLAT_Pseudo>(ps).Mnemonic, opName>, Requires<[isGFX11Plus]>; } multiclass FLAT_Real_RTN_gfx11<bits<7> op, string ps, string opName> { @@ -2219,7 +2263,8 @@ multiclass FLAT_Real_GlblAtomics_gfx11<bits<7> op, string ps, string opName, int FLAT_Real_RTN_gfx11<op, ps, opName>, FLAT_Real_SADDR_RTN_gfx11<op, ps, opName>; -multiclass FLAT_Real_GlblAtomics_RTN_gfx11<bits<7> op, string ps, string opName> : +multiclass FLAT_Real_GlblAtomics_RTN_gfx11<bits<7> op, string ps, string opName, int renamed = false> : + FLAT_Aliases_gfx11<ps#"_RTN", opName, renamed>, FLAT_Real_RTN_gfx11<op, ps, opName>, FLAT_Real_SADDR_RTN_gfx11<op, ps, opName>; @@ -2312,7 +2357,7 @@ defm GLOBAL_ATOMIC_SWAP_B32 : FLAT_Real_GlblAtomics_gfx11<0x033, "GLOBAL_ATO defm GLOBAL_ATOMIC_CMPSWAP_B32 : FLAT_Real_GlblAtomics_gfx11<0x034, "GLOBAL_ATOMIC_CMPSWAP", 
"global_atomic_cmpswap_b32", true>; defm GLOBAL_ATOMIC_ADD_U32 : FLAT_Real_GlblAtomics_gfx11<0x035, "GLOBAL_ATOMIC_ADD", "global_atomic_add_u32", true>; defm GLOBAL_ATOMIC_SUB_U32 : FLAT_Real_GlblAtomics_gfx11<0x036, "GLOBAL_ATOMIC_SUB", "global_atomic_sub_u32", true>; -defm GLOBAL_ATOMIC_CSUB_U32 : FLAT_Real_GlblAtomics_RTN_gfx11<0x037, "GLOBAL_ATOMIC_CSUB", "global_atomic_csub_u32">; +defm GLOBAL_ATOMIC_CSUB_U32 : FLAT_Real_GlblAtomics_RTN_gfx11<0x037, "GLOBAL_ATOMIC_CSUB", "global_atomic_csub_u32", true>; defm GLOBAL_ATOMIC_MIN_I32 : FLAT_Real_GlblAtomics_gfx11<0x038, "GLOBAL_ATOMIC_SMIN", "global_atomic_min_i32", true>; defm GLOBAL_ATOMIC_MIN_U32 : FLAT_Real_GlblAtomics_gfx11<0x039, "GLOBAL_ATOMIC_UMIN", "global_atomic_min_u32", true>; defm GLOBAL_ATOMIC_MAX_I32 : FLAT_Real_GlblAtomics_gfx11<0x03a, "GLOBAL_ATOMIC_SMAX", "global_atomic_max_i32", true>; diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp index f2452a275bdc..c9e0c6849568 100644 --- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp +++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -42,6 +42,16 @@ namespace { class GCNCreateVOPD : public MachineFunctionPass { private: + class VOPDCombineInfo { + public: + VOPDCombineInfo() {} + VOPDCombineInfo(MachineInstr *First, MachineInstr *Second) + : FirstMI(First), SecondMI(Second) {} + + MachineInstr *FirstMI; + MachineInstr *SecondMI; + }; + public: static char ID; const GCNSubtarget *ST = nullptr; @@ -57,10 +67,9 @@ public: return "GCN Create VOPD Instructions"; } - bool doReplace(const SIInstrInfo *SII, - std::pair<MachineInstr *, MachineInstr *> &Pair) { - auto *FirstMI = Pair.first; - auto *SecondMI = Pair.second; + bool doReplace(const SIInstrInfo *SII, VOPDCombineInfo &CI) { + auto *FirstMI = CI.FirstMI; + auto *SecondMI = CI.SecondMI; unsigned Opc1 = FirstMI->getOpcode(); unsigned Opc2 = SecondMI->getOpcode(); int NewOpcode = AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1), @@ -94,7 +103,7 @@ public: 
VOPDInst.copyImplicitOps(*MI[CompIdx]); LLVM_DEBUG(dbgs() << "VOPD Fused: " << *VOPDInst << " from\tX: " - << *Pair.first << "\tY: " << *Pair.second << "\n"); + << *CI.FirstMI << "\tY: " << *CI.SecondMI << "\n"); for (auto CompIdx : VOPD::COMPONENTS) MI[CompIdx]->eraseFromParent(); @@ -114,7 +123,7 @@ public: const SIInstrInfo *SII = ST->getInstrInfo(); bool Changed = false; - SmallVector<std::pair<MachineInstr *, MachineInstr *>> ReplaceCandidates; + SmallVector<VOPDCombineInfo> ReplaceCandidates; for (auto &MBB : MF) { auto MII = MBB.begin(), E = MBB.end(); @@ -130,24 +139,24 @@ public: unsigned Opc2 = SecondMI->getOpcode(); llvm::AMDGPU::CanBeVOPD FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc); llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2); - std::pair<MachineInstr *, MachineInstr *> Pair; + VOPDCombineInfo CI; if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y) - Pair = {FirstMI, SecondMI}; + CI = VOPDCombineInfo(FirstMI, SecondMI); else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X) - Pair = {SecondMI, FirstMI}; + CI = VOPDCombineInfo(SecondMI, FirstMI); else continue; // checkVOPDRegConstraints cares about program order, but doReplace // cares about X-Y order in the constituted VOPD if (llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI)) { - ReplaceCandidates.push_back(Pair); + ReplaceCandidates.push_back(CI); ++MII; } } } - for (auto &Pair : ReplaceCandidates) { - Changed |= doReplace(SII, Pair); + for (auto &CI : ReplaceCandidates) { + Changed |= doReplace(SII, CI); } return Changed; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index b33e614a071c..2d53b2a70dbe 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -16,7 +16,7 @@ #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/Support/TargetParser.h" +#include 
"llvm/TargetParser/TargetParser.h" using namespace llvm; @@ -588,23 +588,21 @@ int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard, static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV, MCRegister Reg) { - for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) - BV.set(*RUI); + for (MCRegUnit Unit : TRI.regunits(Reg)) + BV.set(Unit); } static void addRegsToSet(const SIRegisterInfo &TRI, iterator_range<MachineInstr::const_mop_iterator> Ops, - BitVector &Set) { + BitVector &DefSet, BitVector &UseSet) { for (const MachineOperand &Op : Ops) { if (Op.isReg()) - addRegUnits(TRI, Set, Op.getReg().asMCReg()); + addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg()); } } void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) { - // XXX: Do we need to worry about implicit operands - addRegsToSet(TRI, MI.defs(), ClauseDefs); - addRegsToSet(TRI, MI.uses(), ClauseUses); + addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses); } static bool breaksSMEMSoftClause(MachineInstr *MI) { @@ -1033,11 +1031,11 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { const MachineRegisterInfo &MRI = MF.getRegInfo(); int WaitStatesNeeded = 0; - for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands(); - I != E; ++I) { - const MachineOperand &Op = IA->getOperand(I); + for (const MachineOperand &Op : + llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) { if (Op.isReg() && Op.isDef()) { - WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI)); + WaitStatesNeeded = + std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI)); } } @@ -1172,7 +1170,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { (MI.getOpcode() == AMDGPU::S_WAITCNT && !MI.getOperand(0).getImm()) || (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - MI.getOperand(0).getImm() == 0xffe3); + AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0); }; if 
(::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == @@ -1182,7 +1180,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { const SIInstrInfo *TII = ST.getInstrInfo(); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(0xffe3); + .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); return true; } @@ -1295,7 +1293,7 @@ bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { return true; } if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe) + AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0) return true; return false; }; @@ -1306,7 +1304,7 @@ bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(0xfffe); + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); return true; } @@ -1454,7 +1452,7 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) || (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) || (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - I.getOperand(0).getImm() == 0xffe3); + AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0); }; if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == @@ -1463,7 +1461,7 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(0xffe3); + .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0)); return true; } @@ -1525,7 +1523,7 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) || SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - I.getOperand(0).getImm() == 0x0fff)) + AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0)) return HazardExpired; // 
Track registers writes @@ -1687,10 +1685,10 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { return false; // Hazard is observed - insert a wait on va_dst counter to ensure hazard is - // avoided (mask 0x0fff achieves this). + // avoided. BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(0x0fff); + .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); return true; } @@ -2026,7 +2024,7 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) { MaxWaitStates); int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates; int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); - int OpNo = MI->getOperandNo(&Op); + int OpNo = Op.getOperandNo(); if (OpNo == SrcCIdx) { NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates; } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) { @@ -2205,7 +2203,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { if (NumWaitStates == std::numeric_limits<int>::max()) continue; - int OpNo = MI->getOperandNo(&Use); + int OpNo = Use.getOperandNo(); unsigned Opc1 = MI1->getOpcode(); int NeedWaitStates = 0; if (OpNo == SrcCIdx) { @@ -2781,7 +2779,7 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) { // s_waitcnt_depctr sa_sdst(0) mitigates hazard. if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - !(I.getOperand(0).getImm() & 0x1)) + AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0) return true; // VALU access to any SGPR or literal constant other than HazardReg @@ -2831,7 +2829,7 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { // Add s_waitcnt_depctr sa_sdst(0) after SALU write. BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(0xfffe); + .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0)); // SALU write may be s_getpc in a bundle. 
if (MI->getOpcode() == AMDGPU::S_GETPC_B64) { diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 77960ef62f3a..d89c9b1febde 100644 --- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -367,9 +367,8 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule, } if (!MI->isDebugInstr()) { // Reset read - undef flags and update them later. - for (auto &Op : MI->operands()) - if (Op.isReg() && Op.isDef()) - Op.setIsUndef(false); + for (auto &Op : MI->all_defs()) + Op.setIsUndef(false); RegisterOperands RegOpers; RegOpers.collect(*MI, *TRI, MRI, /*ShouldTrackLaneMasks*/true, diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 366bc0a8ec0d..4c9ad9b5bcf7 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -237,7 +237,7 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget<GCNSubtarget>(); - if (ST->getGeneration() < GCNSubtarget::GFX10) + if (!ST->hasNSAEncoding()) return false; MRI = &MF.getRegInfo(); diff --git a/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp b/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp new file mode 100644 index 000000000000..b50af38683ed --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp @@ -0,0 +1,139 @@ +//===-- GCNPreRALongBranchReg.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// \file +// \brief Pass to estimate pre RA branch size and reserve a pair of SGPRs if +// there is a long branch. Branch size at this point is difficult to track since +// we have no idea what spills will be inserted later on. We just assume 8 bytes +// per instruction to compute approximations without computing the actual +// instruction size to see if we're in the neighborhood of the maximum branch +// distance threshold. Tuning of what is considered "long" is handled through +// amdgpu-long-branch-factor cl argument which sets LongBranchFactor. +//===----------------------------------------------------------------------===// +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-pre-ra-long-branch-reg" + +namespace { + +static cl::opt<double> LongBranchFactor( + "amdgpu-long-branch-factor", cl::init(1.0), cl::Hidden, + cl::desc("Factor to apply to what qualifies as a long branch " + "to reserve a pair of scalar registers. If this value " + "is 0 the long branch registers are never reserved. As this " + "value grows the greater chance the branch distance will fall " + "within the threshold and the registers will be marked to be " + "reserved. We lean towards always reserving a register for " + "long jumps")); + +class GCNPreRALongBranchReg : public MachineFunctionPass { + + struct BasicBlockInfo { + // Offset - Distance from the beginning of the function to the beginning + // of this basic block.
+ uint64_t Offset = 0; + // Size - Size of the basic block in bytes + uint64_t Size = 0; + }; + void generateBlockInfo(MachineFunction &MF, + SmallVectorImpl<BasicBlockInfo> &BlockInfo); + +public: + static char ID; + GCNPreRALongBranchReg() : MachineFunctionPass(ID) { + initializeGCNPreRALongBranchRegPass(*PassRegistry::getPassRegistry()); + } + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { + return "AMDGPU Pre-RA Long Branch Reg"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} // End anonymous namespace. +char GCNPreRALongBranchReg::ID = 0; + +INITIALIZE_PASS(GCNPreRALongBranchReg, DEBUG_TYPE, + "AMDGPU Pre-RA Long Branch Reg", false, false) + +char &llvm::GCNPreRALongBranchRegID = GCNPreRALongBranchReg::ID; +void GCNPreRALongBranchReg::generateBlockInfo( + MachineFunction &MF, SmallVectorImpl<BasicBlockInfo> &BlockInfo) { + + BlockInfo.resize(MF.getNumBlockIDs()); + + // Approximate the size of all basic blocks by just + // assuming 8 bytes per instruction + for (const MachineBasicBlock &MBB : MF) { + uint64_t NumInstr = 0; + // Loop through the basic block and add up all non-debug + // non-meta instructions + for (const MachineInstr &MI : MBB) { + // isMetaInstruction is a superset of isDebugInstr + if (MI.isMetaInstruction()) + continue; + NumInstr += 1; + } + // Approximate size as just 8 bytes per instruction + BlockInfo[MBB.getNumber()].Size = 8 * NumInstr; + } + uint64_t PrevNum = (&MF)->begin()->getNumber(); + for (auto &MBB : + make_range(std::next(MachineFunction::iterator((&MF)->begin())), + (&MF)->end())) { + uint64_t Num = MBB.getNumber(); + // Compute the offset immediately following this block.
+ BlockInfo[Num].Offset = BlockInfo[PrevNum].Offset + BlockInfo[PrevNum].Size; + PrevNum = Num; + } +} +bool GCNPreRALongBranchReg::runOnMachineFunction(MachineFunction &MF) { + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = STM.getInstrInfo(); + const SIRegisterInfo *TRI = STM.getRegisterInfo(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // For now, reserve highest available SGPR pair. After RA, + // shift down to a lower unused pair of SGPRs + // If all registers are used, then findUnusedRegister will return + // AMDGPU::NoRegister. + constexpr bool ReserveHighestRegister = true; + Register LongBranchReservedReg = TRI->findUnusedRegister( + MRI, &AMDGPU::SGPR_64RegClass, MF, ReserveHighestRegister); + if (!LongBranchReservedReg) + return false; + + // Approximate code size and offsets of each basic block + SmallVector<BasicBlockInfo, 16> BlockInfo; + generateBlockInfo(MF, BlockInfo); + + for (const MachineBasicBlock &MBB : MF) { + MachineBasicBlock::const_iterator Last = MBB.getLastNonDebugInstr(); + if (Last == MBB.end() || !Last->isUnconditionalBranch()) + continue; + MachineBasicBlock *DestBB = TII->getBranchDestBlock(*Last); + uint64_t BlockDistance = static_cast<uint64_t>( + LongBranchFactor * BlockInfo[DestBB->getNumber()].Offset); + // If the distance falls outside the threshold assume it is a long branch + // and we need to reserve the registers + if (!TII->isBranchOffsetInRange(Last->getOpcode(), BlockDistance)) { + MFI->setLongBranchReservedReg(LongBranchReservedReg); + return true; + } + } + return false; +} diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index d86138154be6..b9c9358f88b9 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -196,6 +196,14 @@ def : ProcessorModel<"gfx940", SIDPGFX940FullSpeedModel, FeatureISAVersion9_4_0.Features 
>; +def : ProcessorModel<"gfx941", SIDPGFX940FullSpeedModel, + FeatureISAVersion9_4_1.Features +>; + +def : ProcessorModel<"gfx942", SIDPGFX940FullSpeedModel, + FeatureISAVersion9_4_2.Features +>; + //===----------------------------------------------------------------------===// // GCN GFX10. //===----------------------------------------------------------------------===// @@ -263,3 +271,11 @@ def : ProcessorModel<"gfx1102", GFX11SpeedModel, def : ProcessorModel<"gfx1103", GFX11SpeedModel, FeatureISAVersion11_0_3.Features >; + +def : ProcessorModel<"gfx1150", GFX11SpeedModel, + FeatureISAVersion11_5_0.Features +>; + +def : ProcessorModel<"gfx1151", GFX11SpeedModel, + FeatureISAVersion11_5_1.Features +>; diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index f9bed9a76c6f..68cf97170369 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -286,8 +286,8 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { // update max pressure MaxPressure = max(AtMIPressure, MaxPressure); - for (const auto &MO : MI.operands()) { - if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual() || MO.isDead()) + for (const auto &MO : MI.all_defs()) { + if (!MO.getReg().isVirtual() || MO.isDead()) continue; auto Reg = MO.getReg(); @@ -336,23 +336,38 @@ bool GCNDownwardRPTracker::advanceBeforeNext() { assert(SI.isValid()); // Remove dead registers or mask bits. 
- for (auto &It : LiveRegs) { - const LiveInterval &LI = LIS.getInterval(It.first); + SmallSet<Register, 8> SeenRegs; + for (auto &MO : LastTrackedMI->operands()) { + if (!MO.isReg() || !MO.getReg().isVirtual()) + continue; + if (MO.isUse() && !MO.readsReg()) + continue; + if (!SeenRegs.insert(MO.getReg()).second) + continue; + const LiveInterval &LI = LIS.getInterval(MO.getReg()); if (LI.hasSubRanges()) { + auto It = LiveRegs.end(); for (const auto &S : LI.subranges()) { if (!S.liveAt(SI)) { - auto PrevMask = It.second; - It.second &= ~S.LaneMask; - CurPressure.inc(It.first, PrevMask, It.second, *MRI); + if (It == LiveRegs.end()) { + It = LiveRegs.find(MO.getReg()); + if (It == LiveRegs.end()) + llvm_unreachable("register isn't live"); + } + auto PrevMask = It->second; + It->second &= ~S.LaneMask; + CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI); } } + if (It != LiveRegs.end() && It->second.none()) + LiveRegs.erase(It); } else if (!LI.liveAt(SI)) { - auto PrevMask = It.second; - It.second = LaneBitmask::getNone(); - CurPressure.inc(It.first, PrevMask, It.second, *MRI); + auto It = LiveRegs.find(MO.getReg()); + if (It == LiveRegs.end()) + llvm_unreachable("register isn't live"); + CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI); + LiveRegs.erase(It); } - if (It.second.none()) - LiveRegs.erase(It.first); } MaxPressure = max(MaxPressure, CurPressure); @@ -367,9 +382,7 @@ void GCNDownwardRPTracker::advanceToNext() { NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); // Add new registers or mask bits. 
- for (const auto &MO : LastTrackedMI->operands()) { - if (!MO.isReg() || !MO.isDef()) - continue; + for (const auto &MO : LastTrackedMI->all_defs()) { Register Reg = MO.getReg(); if (!Reg.isVirtual()) continue; diff --git a/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp b/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp new file mode 100644 index 000000000000..99db7e4af9fd --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp @@ -0,0 +1,502 @@ +//===-------------- GCNRewritePartialRegUses.cpp --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// RenameIndependentSubregs pass leaves large partially used super registers, +/// for example: +/// undef %0.sub4:VReg_1024 = ... +/// %0.sub5:VReg_1024 = ... +/// %0.sub6:VReg_1024 = ... +/// %0.sub7:VReg_1024 = ... +/// use %0.sub4_sub5_sub6_sub7 +/// use %0.sub6_sub7 +/// +/// GCNRewritePartialRegUses goes right after RenameIndependentSubregs and +/// rewrites such partially used super registers with registers of minimal size: +/// undef %0.sub0:VReg_128 = ... +/// %0.sub1:VReg_128 = ... +/// %0.sub2:VReg_128 = ... +/// %0.sub3:VReg_128 = ... 
+/// use %0.sub0_sub1_sub2_sub3 +/// use %0.sub2_sub3 +/// +/// This allows to avoid subreg lanemasks tracking during register pressure +/// calculation and creates more possibilities for the code unaware of lanemasks +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" + +using namespace llvm; + +#define DEBUG_TYPE "rewrite-partial-reg-uses" + +namespace { + +class GCNRewritePartialRegUses : public MachineFunctionPass { +public: + static char ID; + GCNRewritePartialRegUses() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { + return "Rewrite Partial Register Uses"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved<LiveIntervals>(); + AU.addPreserved<SlotIndexes>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + MachineRegisterInfo *MRI; + const SIRegisterInfo *TRI; + const TargetInstrInfo *TII; + LiveIntervals *LIS; + + /// Rewrite partially used register Reg by shifting all its subregisters to + /// the right and replacing the original register with a register of minimal + /// size. Return true if the change has been made. + bool rewriteReg(Register Reg) const; + + /// Value type for SubRegMap below. + struct SubRegInfo { + /// Register class required to hold the value stored in the SubReg. + const TargetRegisterClass *RC; + + /// Index for the right-shifted subregister. If 0 this is the "covering" + /// subreg i.e. subreg that covers all others. 
Covering subreg becomes the + /// whole register after the replacement. + unsigned SubReg = AMDGPU::NoSubRegister; + SubRegInfo(const TargetRegisterClass *RC_ = nullptr) : RC(RC_) {} + }; + + /// Map OldSubReg -> { RC, NewSubReg }. Used as in/out container. + typedef SmallDenseMap<unsigned, SubRegInfo> SubRegMap; + + /// Given register class RC and the set of used subregs as keys in the SubRegs + /// map return new register class and indexes of right-shifted subregs as + /// values in SubRegs map such that the resulting regclass would contain + /// registers of minimal size. + const TargetRegisterClass *getMinSizeReg(const TargetRegisterClass *RC, + SubRegMap &SubRegs) const; + + /// Given regclass RC and pairs of [OldSubReg, SubRegRC] in SubRegs try to + /// find new regclass such that: + /// 1. It has subregs obtained by shifting each OldSubReg by RShift number + /// of bits to the right. Every "shifted" subreg should have the same + /// SubRegRC. SubRegRC can be null, in this case it initialized using + /// getSubRegisterClass. If CoverSubregIdx is not zero it's a subreg that + /// "covers" all other subregs in pairs. Basically such subreg becomes a + /// whole register. + /// 2. Resulting register class contains registers of minimal size but not + /// less than RegNumBits. + /// + /// SubRegs is map of OldSubReg -> [SubRegRC, NewSubReg] and is used as in/out + /// parameter: + /// OldSubReg - input parameter, + /// SubRegRC - in/out, should be changed for unknown regclass, + /// NewSubReg - output, contains shifted subregs on return. + const TargetRegisterClass * + getRegClassWithShiftedSubregs(const TargetRegisterClass *RC, unsigned RShift, + unsigned RegNumBits, unsigned CoverSubregIdx, + SubRegMap &SubRegs) const; + + /// Update live intervals after rewriting OldReg to NewReg with SubRegs map + /// describing OldSubReg -> NewSubReg mapping. + void updateLiveIntervals(Register OldReg, Register NewReg, + SubRegMap &SubRegs) const; + + /// Helper methods. 
+ + /// Return reg class expected by a MO's parent instruction for a given MO. + const TargetRegisterClass *getOperandRegClass(MachineOperand &MO) const; + + /// Find right-shifted by RShift amount version of the SubReg if it exists, + /// return 0 otherwise. + unsigned shiftSubReg(unsigned SubReg, unsigned RShift) const; + + /// Find subreg index with a given Offset and Size, return 0 if there is no + /// such subregister index. The result is cached in SubRegs data-member. + unsigned getSubReg(unsigned Offset, unsigned Size) const; + + /// Cache for getSubReg method: {Offset, Size} -> SubReg index. + mutable SmallDenseMap<std::pair<unsigned, unsigned>, unsigned> SubRegs; + + /// Return bit mask that contains all register classes that are projected into + /// RC by SubRegIdx. The result is cached in SuperRegMasks data-member. + const uint32_t *getSuperRegClassMask(const TargetRegisterClass *RC, + unsigned SubRegIdx) const; + + /// Cache for getSuperRegClassMask method: { RC, SubRegIdx } -> Class bitmask. + mutable SmallDenseMap<std::pair<const TargetRegisterClass *, unsigned>, + const uint32_t *> + SuperRegMasks; + + /// Return bitmask containing all allocatable register classes with registers + /// aligned at AlignNumBits. The result is cached in + /// AllocatableAndAlignedRegClassMasks data-member. + const BitVector & + getAllocatableAndAlignedRegClassMask(unsigned AlignNumBits) const; + + /// Cache for getAllocatableAndAlignedRegClassMask method: + /// AlignNumBits -> Class bitmask. + mutable SmallDenseMap<unsigned, BitVector> AllocatableAndAlignedRegClassMasks; +}; + +} // end anonymous namespace + +// TODO: move this to the tablegen and use binary search by Offset. 
+unsigned GCNRewritePartialRegUses::getSubReg(unsigned Offset, + unsigned Size) const { + const auto [I, Inserted] = SubRegs.try_emplace({Offset, Size}, 0); + if (Inserted) { + for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) { + if (TRI->getSubRegIdxOffset(Idx) == Offset && + TRI->getSubRegIdxSize(Idx) == Size) { + I->second = Idx; + break; + } + } + } + return I->second; +} + +unsigned GCNRewritePartialRegUses::shiftSubReg(unsigned SubReg, + unsigned RShift) const { + unsigned Offset = TRI->getSubRegIdxOffset(SubReg) - RShift; + return getSubReg(Offset, TRI->getSubRegIdxSize(SubReg)); +} + +const uint32_t * +GCNRewritePartialRegUses::getSuperRegClassMask(const TargetRegisterClass *RC, + unsigned SubRegIdx) const { + const auto [I, Inserted] = + SuperRegMasks.try_emplace({RC, SubRegIdx}, nullptr); + if (Inserted) { + for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) { + if (RCI.getSubReg() == SubRegIdx) { + I->second = RCI.getMask(); + break; + } + } + } + return I->second; +} + +const BitVector &GCNRewritePartialRegUses::getAllocatableAndAlignedRegClassMask( + unsigned AlignNumBits) const { + const auto [I, Inserted] = + AllocatableAndAlignedRegClassMasks.try_emplace(AlignNumBits); + if (Inserted) { + BitVector &BV = I->second; + BV.resize(TRI->getNumRegClasses()); + for (unsigned ClassID = 0; ClassID < TRI->getNumRegClasses(); ++ClassID) { + auto *RC = TRI->getRegClass(ClassID); + if (RC->isAllocatable() && TRI->isRegClassAligned(RC, AlignNumBits)) + BV.set(ClassID); + } + } + return I->second; +} + +const TargetRegisterClass * +GCNRewritePartialRegUses::getRegClassWithShiftedSubregs( + const TargetRegisterClass *RC, unsigned RShift, unsigned RegNumBits, + unsigned CoverSubregIdx, SubRegMap &SubRegs) const { + + unsigned RCAlign = TRI->getRegClassAlignmentNumBits(RC); + LLVM_DEBUG(dbgs() << " Shift " << RShift << ", reg align " << RCAlign + << '\n'); + + BitVector ClassMask(getAllocatableAndAlignedRegClassMask(RCAlign)); + for 
(auto &[OldSubReg, SRI] : SubRegs) { + auto &[SubRegRC, NewSubReg] = SRI; + + // Register class may be unknown, for example: + // undef %0.sub4:sgpr_1024 = S_MOV_B32 01 + // %0.sub5:sgpr_1024 = S_MOV_B32 02 + // %1:vreg_64 = COPY %0.sub4_sub5 + // Register classes for subregs 'sub4' and 'sub5' are known from the + // description of destination operand of S_MOV_B32 instruction but the + // class for the subreg 'sub4_sub5' isn't specified by the COPY instruction. + if (!SubRegRC) + SubRegRC = TRI->getSubRegisterClass(RC, OldSubReg); + + if (!SubRegRC) + return nullptr; + + LLVM_DEBUG(dbgs() << " " << TRI->getSubRegIndexName(OldSubReg) << ':' + << TRI->getRegClassName(SubRegRC) + << (SubRegRC->isAllocatable() ? "" : " not alloc") + << " -> "); + + if (OldSubReg == CoverSubregIdx) { + NewSubReg = AMDGPU::NoSubRegister; + LLVM_DEBUG(dbgs() << "whole reg"); + } else { + NewSubReg = shiftSubReg(OldSubReg, RShift); + if (!NewSubReg) { + LLVM_DEBUG(dbgs() << "none\n"); + return nullptr; + } + LLVM_DEBUG(dbgs() << TRI->getSubRegIndexName(NewSubReg)); + } + + const uint32_t *Mask = NewSubReg ? getSuperRegClassMask(SubRegRC, NewSubReg) + : SubRegRC->getSubClassMask(); + if (!Mask) + llvm_unreachable("no register class mask?"); + + ClassMask.clearBitsNotInMask(Mask); + // Don't try to early exit because checking if ClassMask has set bits isn't + // that cheap and we expect it to pass in most cases. + LLVM_DEBUG(dbgs() << ", num regclasses " << ClassMask.count() << '\n'); + } + + // ClassMask is the set of all register classes such that each class is + // allocatable, aligned, has all shifted subregs and each subreg has required + // register class (see SubRegRC above). Now select first (that is largest) + // register class with registers of minimal but not less than RegNumBits size. + // We have to check register size because we may encounter classes of smaller + // registers like VReg_1 in some situations. 
+ const TargetRegisterClass *MinRC = nullptr; + unsigned MinNumBits = std::numeric_limits<unsigned>::max(); + for (unsigned ClassID : ClassMask.set_bits()) { + auto *RC = TRI->getRegClass(ClassID); + unsigned NumBits = TRI->getRegSizeInBits(*RC); + if (NumBits < MinNumBits && NumBits >= RegNumBits) { + MinNumBits = NumBits; + MinRC = RC; + } + if (MinNumBits == RegNumBits) + break; + } +#ifndef NDEBUG + if (MinRC) { + assert(MinRC->isAllocatable() && TRI->isRegClassAligned(MinRC, RCAlign)); + for (auto [SubReg, SRI] : SubRegs) + // Check that all registers in MinRC support SRI.SubReg subregister. + assert(MinRC == TRI->getSubClassWithSubReg(MinRC, SRI.SubReg)); + } +#endif + // There might be zero RShift - in this case we are just trying to find a smaller + // register. + return (MinRC != RC || RShift != 0) ? MinRC : nullptr; +} + +const TargetRegisterClass * +GCNRewritePartialRegUses::getMinSizeReg(const TargetRegisterClass *RC, + SubRegMap &SubRegs) const { + unsigned CoverSubreg = AMDGPU::NoSubRegister; + unsigned Offset = std::numeric_limits<unsigned>::max(); + unsigned End = 0; + for (auto [SubReg, SRI] : SubRegs) { + unsigned SubRegOffset = TRI->getSubRegIdxOffset(SubReg); + unsigned SubRegEnd = SubRegOffset + TRI->getSubRegIdxSize(SubReg); + if (SubRegOffset < Offset) { + Offset = SubRegOffset; + CoverSubreg = AMDGPU::NoSubRegister; + } + if (SubRegEnd > End) { + End = SubRegEnd; + CoverSubreg = AMDGPU::NoSubRegister; + } + if (SubRegOffset == Offset && SubRegEnd == End) + CoverSubreg = SubReg; + } + // If covering subreg is found shift everything so the covering subreg would + // be in the rightmost position. + if (CoverSubreg != AMDGPU::NoSubRegister) + return getRegClassWithShiftedSubregs(RC, Offset, End - Offset, CoverSubreg, + SubRegs); + + // Otherwise find subreg with maximum required alignment and shift it and all + // other subregs to the rightmost possible position with respect to the + // alignment.
+ unsigned MaxAlign = 0; + for (auto [SubReg, SRI] : SubRegs) + MaxAlign = std::max(MaxAlign, TRI->getSubRegAlignmentNumBits(RC, SubReg)); + + unsigned FirstMaxAlignedSubRegOffset = std::numeric_limits<unsigned>::max(); + for (auto [SubReg, SRI] : SubRegs) { + if (TRI->getSubRegAlignmentNumBits(RC, SubReg) != MaxAlign) + continue; + FirstMaxAlignedSubRegOffset = + std::min(FirstMaxAlignedSubRegOffset, TRI->getSubRegIdxOffset(SubReg)); + if (FirstMaxAlignedSubRegOffset == Offset) + break; + } + + unsigned NewOffsetOfMaxAlignedSubReg = + alignTo(FirstMaxAlignedSubRegOffset - Offset, MaxAlign); + + if (NewOffsetOfMaxAlignedSubReg > FirstMaxAlignedSubRegOffset) + llvm_unreachable("misaligned subreg"); + + unsigned RShift = FirstMaxAlignedSubRegOffset - NewOffsetOfMaxAlignedSubReg; + return getRegClassWithShiftedSubregs(RC, RShift, End - RShift, 0, SubRegs); +} + +// Only the subrange's lanemasks of the original interval need to be modified. +// Subrange for a covering subreg becomes the main range. 
+void GCNRewritePartialRegUses::updateLiveIntervals(Register OldReg, + Register NewReg, + SubRegMap &SubRegs) const { + if (!LIS->hasInterval(OldReg)) + return; + + auto &OldLI = LIS->getInterval(OldReg); + auto &NewLI = LIS->createEmptyInterval(NewReg); + + auto &Allocator = LIS->getVNInfoAllocator(); + NewLI.setWeight(OldLI.weight()); + + for (auto &SR : OldLI.subranges()) { + auto I = find_if(SubRegs, [&](auto &P) { + return SR.LaneMask == TRI->getSubRegIndexLaneMask(P.first); + }); + + if (I == SubRegs.end()) { + // There might be a situation when subranges don't exactly match used + // subregs, for example: + // %120 [160r,1392r:0) 0@160r + // L000000000000C000 [160r,1392r:0) 0@160r + // L0000000000003000 [160r,1392r:0) 0@160r + // L0000000000000C00 [160r,1392r:0) 0@160r + // L0000000000000300 [160r,1392r:0) 0@160r + // L0000000000000003 [160r,1104r:0) 0@160r + // L000000000000000C [160r,1104r:0) 0@160r + // L0000000000000030 [160r,1104r:0) 0@160r + // L00000000000000C0 [160r,1104r:0) 0@160r + // but used subregs are: + // sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, L000000000000FFFF + // sub0_sub1_sub2_sub3, L00000000000000FF + // sub4_sub5_sub6_sub7, L000000000000FF00 + // In this example subregs sub0_sub1_sub2_sub3 and sub4_sub5_sub6_sub7 + // have several subranges with the same lifetime. For such cases just + // recreate the interval. + LIS->removeInterval(OldReg); + LIS->removeInterval(NewReg); + LIS->createAndComputeVirtRegInterval(NewReg); + return; + } + + if (unsigned NewSubReg = I->second.SubReg) + NewLI.createSubRangeFrom(Allocator, + TRI->getSubRegIndexLaneMask(NewSubReg), SR); + else // This is the covering subreg (0 index) - set it as main range. 
+ NewLI.assign(SR, Allocator); + + SubRegs.erase(I); + } + if (NewLI.empty()) + NewLI.assign(OldLI, Allocator); + NewLI.verify(MRI); + LIS->removeInterval(OldReg); +} + +const TargetRegisterClass * +GCNRewritePartialRegUses::getOperandRegClass(MachineOperand &MO) const { + MachineInstr *MI = MO.getParent(); + return TII->getRegClass(TII->get(MI->getOpcode()), MI->getOperandNo(&MO), TRI, + *MI->getParent()->getParent()); +} + +bool GCNRewritePartialRegUses::rewriteReg(Register Reg) const { + auto Range = MRI->reg_nodbg_operands(Reg); + if (Range.begin() == Range.end()) + return false; + + for (MachineOperand &MO : Range) { + if (MO.getSubReg() == AMDGPU::NoSubRegister) // Whole reg used, quit. + return false; + } + + auto *RC = MRI->getRegClass(Reg); + LLVM_DEBUG(dbgs() << "Try to rewrite partial reg " << printReg(Reg, TRI) + << ':' << TRI->getRegClassName(RC) << '\n'); + + // Collect used subregs and constrained reg classes inferred from instruction + // operands. + SubRegMap SubRegs; + for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) { + assert(MO.getSubReg() != AMDGPU::NoSubRegister); + auto *OpDescRC = getOperandRegClass(MO); + const auto [I, Inserted] = SubRegs.try_emplace(MO.getSubReg(), OpDescRC); + if (!Inserted && OpDescRC) { + SubRegInfo &SRI = I->second; + SRI.RC = SRI.RC ? 
TRI->getCommonSubClass(SRI.RC, OpDescRC) : OpDescRC; + if (!SRI.RC) { + LLVM_DEBUG(dbgs() << " Couldn't find common target regclass\n"); + return false; + } + } + } + + auto *NewRC = getMinSizeReg(RC, SubRegs); + if (!NewRC) { + LLVM_DEBUG(dbgs() << " No improvement achieved\n"); + return false; + } + + Register NewReg = MRI->createVirtualRegister(NewRC); + LLVM_DEBUG(dbgs() << " Success " << printReg(Reg, TRI) << ':' + << TRI->getRegClassName(RC) << " -> " + << printReg(NewReg, TRI) << ':' + << TRI->getRegClassName(NewRC) << '\n'); + + for (auto &MO : make_early_inc_range(MRI->reg_operands(Reg))) { + MO.setReg(NewReg); + // Debug info can refer to the whole reg, just leave it as it is for now. + // TODO: create some DI shift expression? + if (MO.isDebug() && MO.getSubReg() == 0) + continue; + unsigned SubReg = SubRegs[MO.getSubReg()].SubReg; + MO.setSubReg(SubReg); + if (SubReg == AMDGPU::NoSubRegister && MO.isDef()) + MO.setIsUndef(false); + } + + if (LIS) + updateLiveIntervals(Reg, NewReg, SubRegs); + + return true; +} + +bool GCNRewritePartialRegUses::runOnMachineFunction(MachineFunction &MF) { + MRI = &MF.getRegInfo(); + TRI = static_cast<const SIRegisterInfo *>(MRI->getTargetRegisterInfo()); + TII = MF.getSubtarget().getInstrInfo(); + LIS = getAnalysisIfAvailable<LiveIntervals>(); + bool Changed = false; + for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) { + Changed |= rewriteReg(Register::index2VirtReg(I)); + } + return Changed; +} + +char GCNRewritePartialRegUses::ID; + +char &llvm::GCNRewritePartialRegUsesID = GCNRewritePartialRegUses::ID; + +INITIALIZE_PASS_BEGIN(GCNRewritePartialRegUses, DEBUG_TYPE, + "Rewrite Partial Register Uses", false, false) +INITIALIZE_PASS_END(GCNRewritePartialRegUses, DEBUG_TYPE, + "Rewrite Partial Register Uses", false, false) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 6946a05bc551..994cfea1fd7d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp 
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -45,6 +45,13 @@ static cl::opt<unsigned> ScheduleMetricBias( "100 to chase the occupancy only."), cl::init(10)); +static cl::opt<bool> + RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden, + cl::desc("Relax occupancy targets for kernels which are memory " + "bound (amdgpu-membound-threshold), or " + "Wave Limited (amdgpu-limit-wave-threshold)."), + cl::init(false)); + const unsigned ScheduleMetrics::ScaleFactor = 100; GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) @@ -67,7 +74,10 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { // Set the initial TargetOccupnacy to the maximum occupancy that we can // achieve for this function. This effectively sets a lower bound on the // 'Critical' register limits in the scheduler. - TargetOccupancy = MFI.getOccupancy(); + // Allow for lower occupancy targets if kernel is wave limited or memory + // bound, and using the relaxed occupancy feature. + TargetOccupancy = + RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy(); SGPRCriticalLimit = std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit); @@ -471,6 +481,12 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive( StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) { LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); + if (RelaxedOcc) { + MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy); + if (MinOccupancy != StartingOccupancy) + LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy + << ".\n"); + } } std::unique_ptr<GCNSchedStage> @@ -511,11 +527,19 @@ void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, // If the block has the only successor then live-ins of that successor are // live-outs of the current block. We can reuse calculated live set if the // successor will be sent to scheduling past current block. 
+ + // However, due to the bug in LiveInterval analysis it may happen that two + // predecessors of the same successor block have different lane bitmasks for + // a live-out register. Workaround that by sticking to one-to-one relationship + // i.e. one predecessor with one successor block. const MachineBasicBlock *OnlySucc = nullptr; - if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) { - SlotIndexes *Ind = LIS->getSlotIndexes(); - if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin())) - OnlySucc = *MBB->succ_begin(); + if (MBB->succ_size() == 1) { + auto *Candidate = *MBB->succ_begin(); + if (!Candidate->empty() && Candidate->pred_size() == 1) { + SlotIndexes *Ind = LIS->getSlotIndexes(); + if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate)) + OnlySucc = Candidate; + } } // Scheduler sends regions from the end of the block upwards. @@ -864,7 +888,8 @@ void GCNSchedStage::setupNewBlock() { DAG.startBlock(CurrentMBB); // Get real RP for the region if it hasn't be calculated before. After the // initial schedule stage real RP will be collected after scheduling. - if (StageID == GCNSchedStageID::OccInitialSchedule) + if (StageID == GCNSchedStageID::OccInitialSchedule || + StageID == GCNSchedStageID::ILPInitialSchedule) DAG.computeBlockPressure(RegionIdx, CurrentMBB); } @@ -1100,6 +1125,10 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { return true; } + // Do not attempt to relax schedule even more if we are already spilling. + if (isRegionWithExcessRP()) + return false; + LLVM_DEBUG( dbgs() << "\n\t *** In shouldRevertScheduling ***\n" @@ -1188,9 +1217,8 @@ void GCNSchedStage::revertScheduling() { } // Reset read-undef flags and update them later. 
- for (auto &Op : MI->operands()) - if (Op.isReg() && Op.isDef()) - Op.setIsUndef(false); + for (auto &Op : MI->all_defs()) + Op.setIsUndef(false); RegisterOperands RegOpers; RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false); if (!MI->isDebugInstr()) { @@ -1463,8 +1491,8 @@ bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) { if (!DAG.TII->isTriviallyReMaterializable(MI)) return false; - for (const MachineOperand &MO : MI.operands()) - if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual()) + for (const MachineOperand &MO : MI.all_uses()) + if (MO.getReg().isVirtual()) return false; return true; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 2017ae84353c..ef5470df876d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -15,10 +15,12 @@ #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H #include "AMDGPUCallLowering.h" +#include "AMDGPURegisterBankInfo.h" #include "AMDGPUSubtarget.h" #include "SIFrameLowering.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #define GET_SUBTARGETINFO_HEADER @@ -51,7 +53,7 @@ private: std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; std::unique_ptr<InstructionSelector> InstSelector; std::unique_ptr<LegalizerInfo> Legalizer; - std::unique_ptr<RegisterBankInfo> RegBankInfo; + std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo; protected: // Basic subtarget description. @@ -63,7 +65,6 @@ protected: unsigned MaxPrivateElementSize = 0; // Possibly statically set by tablegen, but may want to be overridden. 
- bool FastFMAF32 = false; bool FastDenormalF32 = false; bool HalfRate64Ops = false; bool FullRate64Ops = false; @@ -132,7 +133,7 @@ protected: bool HasA16 = false; bool HasG16 = false; bool HasNSAEncoding = false; - unsigned NSAMaxSize = 0; + bool HasPartialNSAEncoding = false; bool GFX10_AEncoding = false; bool GFX10_BEncoding = false; bool HasDLInsts = false; @@ -146,12 +147,17 @@ protected: bool HasDot7Insts = false; bool HasDot8Insts = false; bool HasDot9Insts = false; + bool HasDot10Insts = false; bool HasMAIInsts = false; bool HasFP8Insts = false; bool HasPkFmacF16Inst = false; + bool HasAtomicDsPkAdd16Insts = false; + bool HasAtomicFlatPkAdd16Insts = false; bool HasAtomicFaddRtnInsts = false; bool HasAtomicFaddNoRtnInsts = false; - bool HasAtomicPkFaddNoRtnInsts = false; + bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false; + bool HasAtomicBufferGlobalPkAddF16Insts = false; + bool HasAtomicGlobalPkAddBF16Inst = false; bool HasFlatAtomicFaddF32Inst = false; bool SupportsSRAMECC = false; @@ -173,6 +179,7 @@ protected: bool ScalarFlatScratchInsts = false; bool HasArchitectedFlatScratch = false; bool EnableFlatScratch = false; + bool HasArchitectedSGPRs = false; bool AddNoCarryInsts = false; bool HasUnpackedD16VMem = false; bool LDSMisalignedBug = false; @@ -198,6 +205,7 @@ protected: bool HasMADIntraFwdBug = false; bool HasVOPDInsts = false; bool HasVALUTransUseHazard = false; + bool HasForceStoreSC0SC1 = false; // Dummy feature to use for assembler in tablegen. bool FeatureDisable = false; @@ -248,7 +256,7 @@ public: return Legalizer.get(); } - const RegisterBankInfo *getRegBankInfo() const override { + const AMDGPURegisterBankInfo *getRegBankInfo() const override { return RegBankInfo.get(); } @@ -283,7 +291,7 @@ public: /// Return the number of high bits known to be zero for a frame index. 
unsigned getKnownHighZeroBitsForFrameIndex() const { - return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); + return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); } int getLDSBankCount() const { @@ -319,10 +327,6 @@ public: return FP64; } - bool hasFastFMAF32() const { - return FastFMAF32; - } - bool hasHalfRate64Ops() const { return HalfRate64Ops; } @@ -738,6 +742,10 @@ public: return HasDot9Insts; } + bool hasDot10Insts() const { + return HasDot10Insts; + } + bool hasMAIInsts() const { return HasMAIInsts; } @@ -750,6 +758,10 @@ public: return HasPkFmacF16Inst; } + bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; } + + bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; } + bool hasAtomicFaddInsts() const { return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts; } @@ -758,7 +770,17 @@ public: bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } - bool hasAtomicPkFaddNoRtnInsts() const { return HasAtomicPkFaddNoRtnInsts; } + bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const { + return HasAtomicBufferGlobalPkAddF16NoRtnInsts; + } + + bool hasAtomicBufferGlobalPkAddF16Insts() const { + return HasAtomicBufferGlobalPkAddF16Insts; + } + + bool hasAtomicGlobalPkAddBF16Inst() const { + return HasAtomicGlobalPkAddBF16Inst; + } bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } @@ -924,7 +946,9 @@ public: bool hasNSAEncoding() const { return HasNSAEncoding; } - unsigned getNSAMaxSize() const { return NSAMaxSize; } + bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; } + + unsigned getNSAMaxSize() const { return AMDGPU::getNSAMaxSize(*this); } bool hasGFX10_AEncoding() const { return GFX10_AEncoding; @@ -1070,6 +1094,8 @@ public: bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; } + bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; } + bool hasVALUMaskWriteHazard() const { return 
getGeneration() >= GFX11; } /// Return if operations acting on VGPR tuples require even alignment. @@ -1126,6 +1152,9 @@ public: /// In this case it is readonly. bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } + /// \returns true if the architected SGPRs are enabled. + bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; } + /// \returns true if the machine has merged shaders in which s0-s7 are /// reserved by the hardware and user SGPRs start at s8 bool hasMergedShaders() const { @@ -1323,6 +1352,14 @@ public: // \returns the number of address arguments from which to enable MIMG NSA // on supported architectures. unsigned getNSAThreshold(const MachineFunction &MF) const; + + // \returns true if the subtarget has a hazard requiring an "s_nop 0" + // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)". + bool requiresNopBeforeDeallocVGPRs() const { + // Currently all targets that support the dealloc VGPRs message also require + // the nop. + return true; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp index 95ea42267ccf..29c9b9ccf276 100644 --- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp @@ -63,7 +63,7 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, }() && "Expected FirstMI to precede SecondMI"); // Cannot pair dependent instructions for (const auto &Use : SecondMI.uses()) - if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg())) + if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg(), TRI)) return false; auto getVRegIdx = [&](unsigned OpcodeIdx, unsigned OperandIdx) { diff --git a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td index 1f65376890da..4956a1586774 100644 --- a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td +++ b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td @@ -34,7 +34,7 @@ class LDSDIRe<bits<2> op, bit is_direct> : Enc32 { 
class LDSDIR_getIns<bit direct> { dag ret = !if(direct, (ins wait_vdst:$waitvdst), - (ins Attr:$attr, AttrChan:$attrchan, wait_vdst:$waitvdst) + (ins InterpAttr:$attr, InterpAttrChan:$attrchan, wait_vdst:$waitvdst) ); } diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp index 24c9cc2d7dd2..a1f8be403c44 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp @@ -348,9 +348,9 @@ createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI, /// Extern function to initialize the targets for the AMDGPU backend extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() { - TargetRegistry::RegisterCustomBehaviour(getTheAMDGPUTarget(), + TargetRegistry::RegisterCustomBehaviour(getTheR600Target(), createAMDGPUCustomBehaviour); - TargetRegistry::RegisterInstrPostProcess(getTheAMDGPUTarget(), + TargetRegistry::RegisterInstrPostProcess(getTheR600Target(), createAMDGPUInstrPostProcess); TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(), diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h index 7a0d454c3578..cb1436d319c9 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h @@ -19,7 +19,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/MCA/CustomBehaviour.h" -#include "llvm/Support/TargetParser.h" +#include "llvm/TargetParser/TargetParser.h" namespace llvm { namespace mca { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index f0653aec925d..44109b9d2919 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -19,7 +19,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/EndianStream.h" 
-#include "llvm/Support/TargetParser.h" +#include "llvm/TargetParser/TargetParser.h" using namespace llvm; using namespace llvm::AMDGPU; @@ -79,7 +79,7 @@ bool AMDGPUAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, bool AMDGPUAsmBackend::mayNeedRelaxation(const MCInst &Inst, const MCSubtargetInfo &STI) const { - if (!STI.getFeatureBits()[AMDGPU::FeatureOffset3fBug]) + if (!STI.hasFeature(AMDGPU::FeatureOffset3fBug)) return false; if (AMDGPU::getSOPPWithRelaxation(Inst.getOpcode()) >= 0) diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 066b36622a16..3f188478ca8b 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -74,9 +74,9 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_AMDGPU_REL32; case FK_Data_4: case FK_SecRel_4: - return ELF::R_AMDGPU_ABS32; + return IsPCRel ? ELF::R_AMDGPU_REL32 : ELF::R_AMDGPU_ABS32; case FK_Data_8: - return ELF::R_AMDGPU_ABS64; + return IsPCRel ? 
ELF::R_AMDGPU_REL64 : ELF::R_AMDGPU_ABS64; } if (Fixup.getTargetKind() == AMDGPU::fixup_si_sopp_br) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index e465267f2c20..ad55c73b22ea 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -19,7 +19,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/TargetParser.h" +#include "llvm/TargetParser/TargetParser.h" using namespace llvm; using namespace llvm::AMDGPU; @@ -60,11 +60,6 @@ void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo, O << formatHex(MI->getOperand(OpNo).getImm() & 0xf); } -void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xff); -} - void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -105,21 +100,6 @@ void AMDGPUInstPrinter::printNamedBit(const MCInst *MI, unsigned OpNo, } } -void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printNamedBit(MI, OpNo, O, "offen"); -} - -void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printNamedBit(MI, OpNo, O, "idxen"); -} - -void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - printNamedBit(MI, OpNo, O, "addr64"); -} - void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -141,15 +121,10 @@ void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo, bool IsFlatSeg = !(Desc.TSFlags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch)); - if (IsFlatSeg) { // Unsigned offset + if (IsFlatSeg) // Unsigned offset printU16ImmDecOperand(MI, OpNo, O); 
- } else { // Signed offset - if (AMDGPU::isGFX10(STI)) { - O << formatDec(SignExtend32<12>(MI->getOperand(OpNo).getImm())); - } else { - O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm())); - } - } + else // Signed offset + O << formatDec(SignExtend32(Imm, AMDGPU::getNumFlatOffsetBits(STI))); } } @@ -196,11 +171,6 @@ void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, printU32ImmOperand(MI, OpNo, STI, O); } -void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "gds"); -} - void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { auto Imm = MI->getOperand(OpNo).getImm(); @@ -218,15 +188,6 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo, O << " /* unexpected cache policy bit */"; } -void AMDGPUInstPrinter::printSWZ(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { -} - -void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "tfe"); -} - void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { if (MI->getOperand(OpNo).getImm()) { @@ -247,16 +208,6 @@ void AMDGPUInstPrinter::printDim(const MCInst *MI, unsigned OpNo, O << Dim; } -void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "unorm"); -} - -void AMDGPUInstPrinter::printDA(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "da"); -} - void AMDGPUInstPrinter::printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { if (STI.hasFeature(AMDGPU::FeatureR128A16)) @@ -265,33 +216,6 @@ void AMDGPUInstPrinter::printR128A16(const MCInst *MI, unsigned OpNo, 
printNamedBit(MI, OpNo, O, "r128"); } -void AMDGPUInstPrinter::printA16(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "a16"); -} - -void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "lwe"); -} - -void AMDGPUInstPrinter::printD16(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "d16"); -} - -void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - printNamedBit(MI, OpNo, O, "compr"); -} - -void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - printNamedBit(MI, OpNo, O, "vm"); -} - void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -462,7 +386,7 @@ void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, else if (Imm == 0xC400) O<< "-4.0"; else if (Imm == 0x3118 && - STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) { + STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) { O << "0.15915494"; } else { uint64_t Imm16 = static_cast<uint16_t>(Imm); @@ -486,26 +410,26 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, return; } - if (Imm == FloatToBits(0.0f)) + if (Imm == llvm::bit_cast<uint32_t>(0.0f)) O << "0.0"; - else if (Imm == FloatToBits(1.0f)) + else if (Imm == llvm::bit_cast<uint32_t>(1.0f)) O << "1.0"; - else if (Imm == FloatToBits(-1.0f)) + else if (Imm == llvm::bit_cast<uint32_t>(-1.0f)) O << "-1.0"; - else if (Imm == FloatToBits(0.5f)) + else if (Imm == llvm::bit_cast<uint32_t>(0.5f)) O << "0.5"; - else if (Imm == FloatToBits(-0.5f)) + else if (Imm == llvm::bit_cast<uint32_t>(-0.5f)) O << "-0.5"; - else if (Imm == FloatToBits(2.0f)) + else if (Imm == llvm::bit_cast<uint32_t>(2.0f)) O << "2.0"; - else if (Imm == FloatToBits(-2.0f)) + else if (Imm 
== llvm::bit_cast<uint32_t>(-2.0f)) O << "-2.0"; - else if (Imm == FloatToBits(4.0f)) + else if (Imm == llvm::bit_cast<uint32_t>(4.0f)) O << "4.0"; - else if (Imm == FloatToBits(-4.0f)) + else if (Imm == llvm::bit_cast<uint32_t>(-4.0f)) O << "-4.0"; else if (Imm == 0x3e22f983 && - STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) + STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) O << "0.15915494"; else O << formatHex(static_cast<uint64_t>(Imm)); @@ -520,26 +444,26 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, return; } - if (Imm == DoubleToBits(0.0)) + if (Imm == llvm::bit_cast<uint64_t>(0.0)) O << "0.0"; - else if (Imm == DoubleToBits(1.0)) + else if (Imm == llvm::bit_cast<uint64_t>(1.0)) O << "1.0"; - else if (Imm == DoubleToBits(-1.0)) + else if (Imm == llvm::bit_cast<uint64_t>(-1.0)) O << "-1.0"; - else if (Imm == DoubleToBits(0.5)) + else if (Imm == llvm::bit_cast<uint64_t>(0.5)) O << "0.5"; - else if (Imm == DoubleToBits(-0.5)) + else if (Imm == llvm::bit_cast<uint64_t>(-0.5)) O << "-0.5"; - else if (Imm == DoubleToBits(2.0)) + else if (Imm == llvm::bit_cast<uint64_t>(2.0)) O << "2.0"; - else if (Imm == DoubleToBits(-2.0)) + else if (Imm == llvm::bit_cast<uint64_t>(-2.0)) O << "-2.0"; - else if (Imm == DoubleToBits(4.0)) + else if (Imm == llvm::bit_cast<uint64_t>(4.0)) O << "4.0"; - else if (Imm == DoubleToBits(-4.0)) + else if (Imm == llvm::bit_cast<uint64_t>(-4.0)) O << "-4.0"; else if (Imm == 0x3fc45f306dc9c882 && - STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) + STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) O << "0.15915494309189532"; else { assert(isUInt<32>(Imm) || isInt<32>(Imm)); @@ -597,7 +521,7 @@ void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand, raw_ostream &O) { if (!FirstOperand) O << ", "; - printRegOperand(STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] + printRegOperand(STI.hasFeature(AMDGPU::FeatureWavefrontSize64) ? 
AMDGPU::VCC : AMDGPU::VCC_LO, O, MRI); @@ -718,7 +642,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: if (!isUInt<16>(Op.getImm()) && - STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) { + STI.hasFeature(AMDGPU::FeatureVOP3Literal)) { printImmediate32(Op.getImm(), STI, O); break; } @@ -742,9 +666,10 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, O << formatDec(Op.getImm()); break; case MCOI::OPERAND_REGISTER: - // FIXME: This should be removed and handled somewhere else. Seems to come - // from a disassembler bug. - O << "/*invalid immediate*/"; + // Disassembler does not fail when operand should not allow immediate + // operands but decodes them into 32bit immediate operand. + printImmediate32(Op.getImm(), STI, O); + O << "/*Invalid immediate*/"; break; default: // We hit this for the immediate instruction bits that don't yet have a @@ -761,9 +686,9 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, int RCID = Desc.operands()[OpNo].RegClass; unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID)); if (RCBits == 32) - printImmediate32(FloatToBits(Value), STI, O); + printImmediate32(llvm::bit_cast<uint32_t>((float)Value), STI, O); else if (RCBits == 64) - printImmediate64(DoubleToBits(Value), STI, O); + printImmediate64(llvm::bit_cast<uint64_t>(Value), STI, O); else llvm_unreachable("Invalid register class size"); } @@ -1012,16 +937,16 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, } } -void AMDGPUInstPrinter::printRowMask(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { +void AMDGPUInstPrinter::printDppRowMask(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { O << " row_mask:"; printU4ImmOperand(MI, OpNo, STI, O); } -void AMDGPUInstPrinter::printBankMask(const MCInst *MI, unsigned OpNo, - const 
MCSubtargetInfo &STI, - raw_ostream &O) { +void AMDGPUInstPrinter::printDppBankMask(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { O << " bank_mask:"; printU4ImmOperand(MI, OpNo, STI, O); } @@ -1035,9 +960,8 @@ void AMDGPUInstPrinter::printDppBoundCtrl(const MCInst *MI, unsigned OpNo, } } -void AMDGPUInstPrinter::printFI(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { +void AMDGPUInstPrinter::printDppFI(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { using namespace llvm::AMDGPU::DPP; unsigned Imm = MI->getOperand(OpNo).getImm(); if (Imm == DPP_FI_1 || Imm == DPP8_FI_1) { @@ -1287,9 +1211,9 @@ void AMDGPUInstPrinter::printInterpAttrChan(const MCInst *MI, unsigned OpNum, O << '.' << "xyzw"[Chan & 0x3]; } -void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { +void AMDGPUInstPrinter::printGPRIdxMode(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { using namespace llvm::AMDGPU::VGPRIndexMode; unsigned Val = MI->getOperand(OpNo).getImm(); @@ -1338,18 +1262,6 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, O << Asm; } -void AMDGPUInstPrinter::printHigh(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - printNamedBit(MI, OpNo, O, "high"); -} - -void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - printNamedBit(MI, OpNo, O, "clamp"); -} - void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -1496,7 +1408,7 @@ void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo, } } -void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, +void AMDGPUInstPrinter::printSWaitCnt(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { 
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI.getCPU()); @@ -1558,7 +1470,7 @@ void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo, } } -void AMDGPUInstPrinter::printDelayFlag(const MCInst *MI, unsigned OpNo, +void AMDGPUInstPrinter::printSDelayALU(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { const char *BadInstId = "/* invalid instid value */"; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 3486cca712ae..3b14faab136b 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -38,7 +38,6 @@ public: private: void printU4ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); @@ -48,9 +47,6 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef BitName); - void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printFlatOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -68,34 +64,14 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); void printCPol(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - 
void printSWZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); - void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printDim(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printUNorm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); - void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printA16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); - void printLWE(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); - void printD16(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); - void printExpCompr(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); - void printExpVM(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); void printFORMAT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printSymbolicFormat(const MCInst *MI, @@ -132,14 +108,14 @@ private: raw_ostream &O); void printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printRowMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); - void printBankMask(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); + void printDppRowMask(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printDppBankMask(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printDppBoundCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printFI(const MCInst *MI, unsigned OpNo, - const 
MCSubtargetInfo &STI, raw_ostream &O); + void printDppFI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printSDWADstSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); @@ -166,8 +142,8 @@ private: void printInterpAttrChan(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printVGPRIndexMode(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); + void printGPRIdxMode(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printMemOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printBLGP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -205,12 +181,8 @@ public: protected: void printAbs(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printHigh(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); void printClamp(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printClampSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); void printOModSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printLiteral(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -241,11 +213,11 @@ protected: raw_ostream &O); void printSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printWaitFlag(const MCInst *MI, unsigned OpNo, + void printSWaitCnt(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printDepCtr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printDelayFlag(const MCInst *MI, unsigned OpNo, + void printSDelayALU(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printHwreg(const MCInst *MI, 
unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index ded3fb7ab8d9..d539d75fdff0 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -8,9 +8,9 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMCAsmInfo.h" -#include "llvm/ADT/Triple.h" -#include "llvm/MC/MCSubtargetInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/TargetParser/Triple.h" using namespace llvm; @@ -40,7 +40,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT, HasNoDeadStrip = true; //===--- Dwarf Emission Directives -----------------------------------===// SupportsDebugInformation = true; - UsesCFIForDebug = true; + UsesCFIWithoutEH = true; DwarfRegNumForCFI = true; UseIntegratedAssembler = false; @@ -58,11 +58,11 @@ unsigned AMDGPUMCAsmInfo::getMaxInstLength(const MCSubtargetInfo *STI) const { return MaxInstLength; // Maximum for NSA encoded images - if (STI->getFeatureBits()[AMDGPU::FeatureNSAEncoding]) + if (STI->hasFeature(AMDGPU::FeatureNSAEncoding)) return 20; // 64-bit instruction with 32-bit literal. 
- if (STI->getFeatureBits()[AMDGPU::FeatureVOP3Literal]) + if (STI->hasFeature(AMDGPU::FeatureVOP3Literal)) return 12; return 8; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index 93bec8aaadfd..5e77a8caa04e 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===// +//===-- AMDGPUMCCodeEmitter.cpp - AMDGPU Code Emitter ---------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -7,14 +7,586 @@ //===----------------------------------------------------------------------===// // /// \file -/// CodeEmitter interface for SI codegen. +/// The AMDGPU code emitter produces machine code that can be executed +/// directly on the GPU device. 
// //===----------------------------------------------------------------------===// -#include "AMDGPUMCCodeEmitter.h" +#include "MCTargetDesc/AMDGPUFixupKinds.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIDefines.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/APInt.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/EndianStream.h" +#include "llvm/TargetParser/SubtargetFeature.h" +#include <optional> using namespace llvm; -// pin vtable to this file -void AMDGPUMCCodeEmitter::anchor() {} +namespace { +class AMDGPUMCCodeEmitter : public MCCodeEmitter { + const MCRegisterInfo &MRI; + const MCInstrInfo &MCII; + +public: + AMDGPUMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI) + : MRI(MRI), MCII(MCII) {} + + /// Encode the instruction and write it to the OS. + void encodeInstruction(const MCInst &MI, SmallVectorImpl<char> &CB, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; + + void getMachineOpValue(const MCInst &MI, const MCOperand &MO, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// Use a fixup to encode the simm16 field for SOPP branch + /// instructions. 
+ void getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + void getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + void getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + void getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + +private: + uint64_t getImplicitOpSelHiEncoding(int Opcode) const; + void getMachineOpValueCommon(const MCInst &MI, const MCOperand &MO, + unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// Encode an fp or int literal. + std::optional<uint32_t> getLitEncoding(const MCOperand &MO, + const MCOperandInfo &OpInfo, + const MCSubtargetInfo &STI) const; + + void getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, + APInt &Inst, APInt &Scratch, + const MCSubtargetInfo &STI) const; +}; + +} // end anonymous namespace + +MCCodeEmitter *llvm::createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII, + MCContext &Ctx) { + return new AMDGPUMCCodeEmitter(MCII, *Ctx.getRegisterInfo()); +} + +// Returns the encoding value to use if the given integer is an integer inline +// immediate value, or 0 if it is not. +template <typename IntTy> +static uint32_t getIntInlineImmEncoding(IntTy Imm) { + if (Imm >= 0 && Imm <= 64) + return 128 + Imm; + + if (Imm >= -16 && Imm <= -1) + return 192 + std::abs(Imm); + + return 0; +} + +static uint32_t getLit16IntEncoding(uint16_t Val, const MCSubtargetInfo &STI) { + uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val)); + return IntImm == 0 ? 
255 : IntImm; +} + +static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo &STI) { + uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val)); + if (IntImm != 0) + return IntImm; + + if (Val == 0x3800) // 0.5 + return 240; + + if (Val == 0xB800) // -0.5 + return 241; + + if (Val == 0x3C00) // 1.0 + return 242; + + if (Val == 0xBC00) // -1.0 + return 243; + + if (Val == 0x4000) // 2.0 + return 244; + + if (Val == 0xC000) // -2.0 + return 245; + + if (Val == 0x4400) // 4.0 + return 246; + + if (Val == 0xC400) // -4.0 + return 247; + + if (Val == 0x3118 && // 1.0 / (2.0 * pi) + STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) + return 248; + + return 255; +} + +static uint32_t getLit32Encoding(uint32_t Val, const MCSubtargetInfo &STI) { + uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val)); + if (IntImm != 0) + return IntImm; + + if (Val == llvm::bit_cast<uint32_t>(0.5f)) + return 240; + + if (Val == llvm::bit_cast<uint32_t>(-0.5f)) + return 241; + + if (Val == llvm::bit_cast<uint32_t>(1.0f)) + return 242; + + if (Val == llvm::bit_cast<uint32_t>(-1.0f)) + return 243; + + if (Val == llvm::bit_cast<uint32_t>(2.0f)) + return 244; + + if (Val == llvm::bit_cast<uint32_t>(-2.0f)) + return 245; + + if (Val == llvm::bit_cast<uint32_t>(4.0f)) + return 246; + + if (Val == llvm::bit_cast<uint32_t>(-4.0f)) + return 247; + + if (Val == 0x3e22f983 && // 1.0 / (2.0 * pi) + STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) + return 248; + + return 255; +} + +static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI) { + uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val)); + if (IntImm != 0) + return IntImm; + + if (Val == llvm::bit_cast<uint64_t>(0.5)) + return 240; + + if (Val == llvm::bit_cast<uint64_t>(-0.5)) + return 241; + + if (Val == llvm::bit_cast<uint64_t>(1.0)) + return 242; + + if (Val == llvm::bit_cast<uint64_t>(-1.0)) + return 243; + + if (Val == llvm::bit_cast<uint64_t>(2.0)) + return 244; + + if 
(Val == llvm::bit_cast<uint64_t>(-2.0)) + return 245; + + if (Val == llvm::bit_cast<uint64_t>(4.0)) + return 246; + + if (Val == llvm::bit_cast<uint64_t>(-4.0)) + return 247; + + if (Val == 0x3fc45f306dc9c882 && // 1.0 / (2.0 * pi) + STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) + return 248; + + return 255; +} + +std::optional<uint32_t> +AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO, + const MCOperandInfo &OpInfo, + const MCSubtargetInfo &STI) const { + int64_t Imm; + if (MO.isExpr()) { + const auto *C = dyn_cast<MCConstantExpr>(MO.getExpr()); + if (!C) + return 255; + + Imm = C->getValue(); + } else { + + assert(!MO.isDFPImm()); + + if (!MO.isImm()) + return {}; + + Imm = MO.getImm(); + } + + switch (OpInfo.OperandType) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_AC_INT32: + case AMDGPU::OPERAND_REG_INLINE_AC_FP32: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: + return getLit32Encoding(static_cast<uint32_t>(Imm), STI); + + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_AC_FP64: + return getLit64Encoding(static_cast<uint64_t>(Imm), STI); + + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI); + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: + // FIXME Is this correct? 
What do inline immediates do on SI for f16 src + // which does not have f16 support? + return getLit16Encoding(static_cast<uint16_t>(Imm), STI); + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: { + if (!isUInt<16>(Imm) && STI.hasFeature(AMDGPU::FeatureVOP3Literal)) + return getLit32Encoding(static_cast<uint32_t>(Imm), STI); + if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) + return getLit16Encoding(static_cast<uint16_t>(Imm), STI); + [[fallthrough]]; + } + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI); + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { + uint16_t Lo16 = static_cast<uint16_t>(Imm); + uint32_t Encoding = getLit16Encoding(Lo16, STI); + return Encoding; + } + case AMDGPU::OPERAND_KIMM32: + case AMDGPU::OPERAND_KIMM16: + return MO.getImm(); + default: + llvm_unreachable("invalid operand size"); + } +} + +uint64_t AMDGPUMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const { + using namespace AMDGPU::VOP3PEncoding; + using namespace AMDGPU::OpName; + + if (AMDGPU::hasNamedOperand(Opcode, op_sel_hi)) { + if (AMDGPU::hasNamedOperand(Opcode, src2)) + return 0; + if (AMDGPU::hasNamedOperand(Opcode, src1)) + return OP_SEL_HI_2; + if (AMDGPU::hasNamedOperand(Opcode, src0)) + return OP_SEL_HI_1 | OP_SEL_HI_2; + } + return OP_SEL_HI_0 | OP_SEL_HI_1 | OP_SEL_HI_2; +} + +static bool isVCMPX64(const MCInstrDesc &Desc) { + return (Desc.TSFlags & SIInstrFlags::VOP3) && + Desc.hasImplicitDefOfPhysReg(AMDGPU::EXEC); +} + +void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, + SmallVectorImpl<char> &CB, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + int Opcode = MI.getOpcode(); + APInt Encoding, Scratch; + getBinaryCodeForInstr(MI, Fixups, Encoding, Scratch, STI); + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + unsigned bytes = 
Desc.getSize(); + + // Set unused op_sel_hi bits to 1 for VOP3P and MAI instructions. + // Note that accvgpr_read/write are MAI, have src0, but do not use op_sel. + if ((Desc.TSFlags & SIInstrFlags::VOP3P) || + Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi || + Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) { + Encoding |= getImplicitOpSelHiEncoding(Opcode); + } + + // GFX10+ v_cmpx opcodes promoted to VOP3 have implied dst=EXEC. + // Documentation requires dst to be encoded as EXEC (0x7E), + // but it looks like the actual value encoded for dst operand + // is ignored by HW. It was decided to define dst as "do not care" + // in td files to allow disassembler accept any dst value. + // However, dst is encoded as EXEC for compatibility with SP3. + if (AMDGPU::isGFX10Plus(STI) && isVCMPX64(Desc)) { + assert((Encoding & 0xFF) == 0); + Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO); + } + + for (unsigned i = 0; i < bytes; i++) { + CB.push_back((uint8_t)Encoding.extractBitsAsZExtValue(8, 8 * i)); + } + + // NSA encoding. 
+ if (AMDGPU::isGFX10Plus(STI) && Desc.TSFlags & SIInstrFlags::MIMG) { + int vaddr0 = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vaddr0); + int srsrc = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::srsrc); + assert(vaddr0 >= 0 && srsrc > vaddr0); + unsigned NumExtraAddrs = srsrc - vaddr0 - 1; + unsigned NumPadding = (-NumExtraAddrs) & 3; + + for (unsigned i = 0; i < NumExtraAddrs; ++i) { + getMachineOpValue(MI, MI.getOperand(vaddr0 + 1 + i), Encoding, Fixups, + STI); + CB.push_back((uint8_t)Encoding.getLimitedValue()); + } + CB.append(NumPadding, 0); + } + + if ((bytes > 8 && STI.hasFeature(AMDGPU::FeatureVOP3Literal)) || + (bytes > 4 && !STI.hasFeature(AMDGPU::FeatureVOP3Literal))) + return; + + // Do not print literals from SISrc Operands for insts with mandatory literals + if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::imm)) + return; + + // Check for additional literals + for (unsigned i = 0, e = Desc.getNumOperands(); i < e; ++i) { + + // Check if this operand should be encoded as [SV]Src + if (!AMDGPU::isSISrcOperand(Desc, i)) + continue; + + // Is this operand a literal immediate? + const MCOperand &Op = MI.getOperand(i); + auto Enc = getLitEncoding(Op, Desc.operands()[i], STI); + if (!Enc || *Enc != 255) + continue; + + // Yes! Encode it + int64_t Imm = 0; + + if (Op.isImm()) + Imm = Op.getImm(); + else if (Op.isExpr()) { + if (const auto *C = dyn_cast<MCConstantExpr>(Op.getExpr())) + Imm = C->getValue(); + + } else if (!Op.isExpr()) // Exprs will be replaced with a fixup value. 
+ llvm_unreachable("Must be immediate or expr"); + + support::endian::write<uint32_t>(CB, Imm, support::endianness::little); + + // Only one literal value allowed + break; + } +} + +void AMDGPUMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, + APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpNo); + + if (MO.isExpr()) { + const MCExpr *Expr = MO.getExpr(); + MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br; + Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); + Op = APInt::getZero(96); + } else { + getMachineOpValue(MI, MO, Op, Fixups, STI); + } +} + +void AMDGPUMCCodeEmitter::getSMEMOffsetEncoding( + const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { + auto Offset = MI.getOperand(OpNo).getImm(); + // VI only supports 20-bit unsigned offsets. + assert(!AMDGPU::isVI(STI) || isUInt<20>(Offset)); + Op = Offset; +} + +void AMDGPUMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, + APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + using namespace AMDGPU::SDWA; + + uint64_t RegEnc = 0; + + const MCOperand &MO = MI.getOperand(OpNo); + + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + RegEnc |= MRI.getEncodingValue(Reg); + RegEnc &= SDWA9EncValues::SRC_VGPR_MASK; + if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) { + RegEnc |= SDWA9EncValues::SRC_SGPR_MASK; + } + Op = RegEnc; + return; + } else { + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI); + if (Enc && *Enc != 255) { + Op = *Enc | SDWA9EncValues::SRC_SGPR_MASK; + return; + } + } + + llvm_unreachable("Unsupported operand kind"); +} + +void AMDGPUMCCodeEmitter::getSDWAVopcDstEncoding( + const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { + using namespace 
AMDGPU::SDWA; + + uint64_t RegEnc = 0; + + const MCOperand &MO = MI.getOperand(OpNo); + + unsigned Reg = MO.getReg(); + if (Reg != AMDGPU::VCC && Reg != AMDGPU::VCC_LO) { + RegEnc |= MRI.getEncodingValue(Reg); + RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK; + RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK; + } + Op = RegEnc; +} + +void AMDGPUMCCodeEmitter::getAVOperandEncoding( + const MCInst &MI, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { + unsigned Reg = MI.getOperand(OpNo).getReg(); + uint64_t Enc = MRI.getEncodingValue(Reg); + + // VGPR and AGPR have the same encoding, but SrcA and SrcB operands of mfma + // instructions use acc[0:1] modifier bits to distinguish. These bits are + // encoded as a virtual 9th bit of the register for these operands. + if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_96RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_160RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_288RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_320RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_352RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_384RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(Reg) || + MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg)) + Enc |= 512; + + Op = Enc; +} + +static bool needsPCRel(const MCExpr *Expr) { + switch (Expr->getKind()) { + case MCExpr::SymbolRef: { + auto *SE = cast<MCSymbolRefExpr>(Expr); + MCSymbolRefExpr::VariantKind Kind = SE->getKind(); + return Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_LO && + Kind != 
MCSymbolRefExpr::VK_AMDGPU_ABS32_HI; + } + case MCExpr::Binary: { + auto *BE = cast<MCBinaryExpr>(Expr); + if (BE->getOpcode() == MCBinaryExpr::Sub) + return false; + return needsPCRel(BE->getLHS()) || needsPCRel(BE->getRHS()); + } + case MCExpr::Unary: + return needsPCRel(cast<MCUnaryExpr>(Expr)->getSubExpr()); + case MCExpr::Target: + case MCExpr::Constant: + return false; + } + llvm_unreachable("invalid kind"); +} + +void AMDGPUMCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + if (MO.isReg()){ + Op = MRI.getEncodingValue(MO.getReg()); + return; + } + unsigned OpNo = &MO - MI.begin(); + getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI); +} + +void AMDGPUMCCodeEmitter::getMachineOpValueCommon( + const MCInst &MI, const MCOperand &MO, unsigned OpNo, APInt &Op, + SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { + + if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) { + // FIXME: If this is expression is PCRel or not should not depend on what + // the expression looks like. 
Given that this is just a general expression, + // it should probably be FK_Data_4 and whatever is producing + // + // s_add_u32 s2, s2, (extern_const_addrspace+16 + // + // And expecting a PCRel should instead produce + // + // .Ltmp1: + // s_add_u32 s2, s2, (extern_const_addrspace+16)-.Ltmp1 + MCFixupKind Kind; + if (needsPCRel(MO.getExpr())) + Kind = FK_PCRel_4; + else + Kind = FK_Data_4; + + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + uint32_t Offset = Desc.getSize(); + assert(Offset == 4 || Offset == 8); + + Fixups.push_back(MCFixup::create(Offset, MO.getExpr(), Kind, MI.getLoc())); + } + + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + if (AMDGPU::isSISrcOperand(Desc, OpNo)) { + if (auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI)) { + Op = *Enc; + return; + } + } else if (MO.isImm()) { + Op = MO.getImm(); + return; + } + + llvm_unreachable("Encoding of this operand type is not supported yet."); +} + +#include "AMDGPUGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h deleted file mode 100644 index 200c9b8726e2..000000000000 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ /dev/null @@ -1,68 +0,0 @@ -//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// CodeEmitter interface for SI codegen. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H -#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H - -#include "llvm/ADT/APInt.h" -#include "llvm/MC/MCCodeEmitter.h" - -namespace llvm { - -class MCInst; -class MCInstrInfo; -class MCOperand; -class MCSubtargetInfo; - -class AMDGPUMCCodeEmitter : public MCCodeEmitter { - virtual void anchor(); - -protected: - const MCInstrInfo &MCII; - - AMDGPUMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {} - -public: - void getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, - APInt &Inst, APInt &Scratch, - const MCSubtargetInfo &STI) const; - - virtual void getMachineOpValue(const MCInst &MI, const MCOperand &MO, - APInt &Op, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const = 0; - - virtual void getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const = 0; - - virtual void getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const = 0; - - virtual void getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const = 0; - - virtual void getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, - APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const = 0; - - virtual void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const = 0; -}; - -} // End namespace llvm - -#endif diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 8a9fea3c8d26..a6a01479b5b1 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ 
b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -150,8 +150,9 @@ static MCInstrAnalysis *createAMDGPUMCInstrAnalysis(const MCInstrInfo *Info) { extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMC() { TargetRegistry::RegisterMCInstrInfo(getTheGCNTarget(), createAMDGPUMCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(getTheAMDGPUTarget(), createR600MCInstrInfo); - for (Target *T : {&getTheAMDGPUTarget(), &getTheGCNTarget()}) { + TargetRegistry::RegisterMCInstrInfo(getTheR600Target(), + createR600MCInstrInfo); + for (Target *T : {&getTheR600Target(), &getTheGCNTarget()}) { RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T); TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); @@ -163,14 +164,14 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMC() { } // R600 specific registration - TargetRegistry::RegisterMCCodeEmitter(getTheAMDGPUTarget(), + TargetRegistry::RegisterMCCodeEmitter(getTheR600Target(), createR600MCCodeEmitter); TargetRegistry::RegisterObjectTargetStreamer( - getTheAMDGPUTarget(), createAMDGPUObjectTargetStreamer); + getTheR600Target(), createAMDGPUObjectTargetStreamer); // GCN specific registration TargetRegistry::RegisterMCCodeEmitter(getTheGCNTarget(), - createSIMCCodeEmitter); + createAMDGPUMCCodeEmitter); TargetRegistry::RegisterAsmTargetStreamer(getTheGCNTarget(), createAMDGPUAsmTargetStreamer); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index c2e2563c3989..006115ba14fc 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -32,8 +32,8 @@ enum AMDGPUDwarfFlavour : unsigned { Wave64 = 0, Wave32 = 1 }; MCRegisterInfo *createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour); -MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, - MCContext &Ctx); +MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII, + 
MCContext &Ctx); MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 7a4af1af33d6..1bd3cdc67800 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -26,7 +26,7 @@ #include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/Support/TargetParser.h" +#include "llvm/TargetParser/TargetParser.h" using namespace llvm; using namespace llvm::AMDGPU; @@ -107,6 +107,8 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: AK = GK_GFX90A; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: AK = GK_GFX90C; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: AK = GK_GFX940; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941: AK = GK_GFX941; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: AK = GK_GFX942; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break; @@ -122,6 +124,8 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1101: AK = GK_GFX1101; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1102: AK = GK_GFX1102; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103: AK = GK_GFX1103; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150: AK = GK_GFX1150; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: AK = GK_GFX1151; break; case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; } @@ -176,6 +180,8 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX90A: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A; case GK_GFX90C: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C; case GK_GFX940: return 
ELF::EF_AMDGPU_MACH_AMDGCN_GFX940; + case GK_GFX941: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX941; + case GK_GFX942: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX942; case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011; case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012; @@ -191,6 +197,8 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX1101: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1101; case GK_GFX1102: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1102; case GK_GFX1103: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103; + case GK_GFX1150: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150; + case GK_GFX1151: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151; case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE; } @@ -320,7 +328,7 @@ bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR, - bool ReserveVCC, bool ReserveFlatScr) { + bool ReserveVCC, bool ReserveFlatScr, unsigned CodeObjectVersion) { IsaVersion IVersion = getIsaVersion(STI.getCPU()); OS << "\t.amdhsa_kernel " << KernelName << '\n'; @@ -367,7 +375,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); - if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) + if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5) PRINT_FIELD(OS, ".amdhsa_uses_dynamic_stack", KD, kernel_code_properties, amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK); PRINT_FIELD(OS, @@ -407,19 +415,17 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( if (IVersion.Major >= 7 && !ReserveFlatScr && !hasArchitectedFlatScratch(STI)) OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n'; - if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) { - 
switch (*HsaAbiVer) { - default: - break; - case ELF::ELFABIVERSION_AMDGPU_HSA_V2: - break; - case ELF::ELFABIVERSION_AMDGPU_HSA_V3: - case ELF::ELFABIVERSION_AMDGPU_HSA_V4: - case ELF::ELFABIVERSION_AMDGPU_HSA_V5: - if (getTargetID()->isXnackSupported()) - OS << "\t\t.amdhsa_reserve_xnack_mask " << getTargetID()->isXnackOnOrAny() << '\n'; - break; - } + switch (CodeObjectVersion) { + default: + break; + case AMDGPU::AMDHSA_COV2: + break; + case AMDGPU::AMDHSA_COV3: + case AMDGPU::AMDHSA_COV4: + case AMDGPU::AMDHSA_COV5: + if (getTargetID()->isXnackSupported()) + OS << "\t\t.amdhsa_reserve_xnack_mask " << getTargetID()->isXnackOnOrAny() << '\n'; + break; } PRINT_FIELD(OS, ".amdhsa_float_round_mode_32", KD, @@ -850,7 +856,8 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) { + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, + unsigned CodeObjectVersion) { auto &Streamer = getStreamer(); auto &Context = Streamer.getContext(); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 50511794a013..db43de8fcc5f 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -93,7 +93,8 @@ public: virtual void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr){}; + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, + unsigned CodeObjectVersion){}; static StringRef getArchNameFromElfMach(unsigned ElfMach); static unsigned getElfMach(StringRef GPU); @@ -104,12 +105,15 @@ public: 
std::optional<AMDGPU::IsaInfo::AMDGPUTargetID> &getTargetID() { return TargetID; } - void initializeTargetID(const MCSubtargetInfo &STI) { + void initializeTargetID(const MCSubtargetInfo &STI, + unsigned CodeObjectVersion) { assert(TargetID == std::nullopt && "TargetID can only be initialized once"); TargetID.emplace(STI); + getTargetID()->setCodeObjectVersion(CodeObjectVersion); } - void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString) { - initializeTargetID(STI); + void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString, + unsigned CodeObjectVersion) { + initializeTargetID(STI, CodeObjectVersion); assert(getTargetID() != std::nullopt && "TargetID is None"); getTargetID()->setTargetIDFromFeaturesString(FeatureString); @@ -153,7 +157,8 @@ public: void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override; + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, + unsigned CodeObjectVersion) override; }; class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { @@ -213,7 +218,8 @@ public: void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override; + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, + unsigned CodeObjectVersion) override; }; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp index f77ed1faf029..22d0594e2b86 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp @@ -97,7 +97,7 @@ void R600InstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, assert(Op.isImm() || Op.isExpr()); if (Op.isImm()) { int64_t Imm = 
Op.getImm(); - O << Imm << '(' << BitsToFloat(Imm) << ')'; + O << Imm << '(' << llvm::bit_cast<float>(static_cast<uint32_t>(Imm)) << ')'; } if (Op.isExpr()) { Op.getExpr()->print(O << '@', &MAI); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index 3d926e52c368..bbbfbe4faa0f 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -21,8 +21,8 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/SubtargetFeature.h" #include "llvm/Support/EndianStream.h" +#include "llvm/TargetParser/SubtargetFeature.h" using namespace llvm; @@ -39,7 +39,7 @@ public: R600MCCodeEmitter &operator=(const R600MCCodeEmitter &) = delete; /// Encode the instruction and write it to the OS. - void encodeInstruction(const MCInst &MI, raw_ostream &OS, + void encodeInstruction(const MCInst &MI, SmallVectorImpl<char> &CB, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; @@ -49,9 +49,8 @@ public: const MCSubtargetInfo &STI) const; private: - - void Emit(uint32_t value, raw_ostream &OS) const; - void Emit(uint64_t value, raw_ostream &OS) const; + void emit(uint32_t value, SmallVectorImpl<char> &CB) const; + void emit(uint64_t value, SmallVectorImpl<char> &CB) const; unsigned getHWReg(unsigned regNo) const; @@ -84,7 +83,8 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, return new R600MCCodeEmitter(MCII, *Ctx.getRegisterInfo()); } -void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, +void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, + SmallVectorImpl<char> &CB, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); @@ -97,13 +97,13 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, } 
else if (IS_VTX(Desc)) { uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI); uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset - if (!(STI.getFeatureBits()[R600::FeatureCaymanISA])) { + if (!(STI.hasFeature(R600::FeatureCaymanISA))) { InstWord2 |= 1 << 19; // Mega-Fetch bit } - Emit(InstWord01, OS); - Emit(InstWord2, OS); - Emit((uint32_t) 0, OS); + emit(InstWord01, CB); + emit(InstWord2, CB); + emit((uint32_t)0, CB); } else if (IS_TEX(Desc)) { int64_t Sampler = MI.getOperand(14).getImm(); @@ -125,28 +125,28 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 | Offsets[2] << 10; - Emit(Word01, OS); - Emit(Word2, OS); - Emit((uint32_t) 0, OS); + emit(Word01, CB); + emit(Word2, CB); + emit((uint32_t)0, CB); } else { uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI); - if ((STI.getFeatureBits()[R600::FeatureR600ALUInst]) && + if ((STI.hasFeature(R600::FeatureR600ALUInst)) && ((Desc.TSFlags & R600_InstFlag::OP1) || Desc.TSFlags & R600_InstFlag::OP2)) { uint64_t ISAOpCode = Inst & (0x3FFULL << 39); Inst &= ~(0x3FFULL << 39); Inst |= ISAOpCode << 1; } - Emit(Inst, OS); + emit(Inst, CB); } } -void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const { - support::endian::write(OS, Value, support::little); +void R600MCCodeEmitter::emit(uint32_t Value, SmallVectorImpl<char> &CB) const { + support::endian::write(CB, Value, support::little); } -void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const { - support::endian::write(OS, Value, support::little); +void R600MCCodeEmitter::emit(uint64_t Value, SmallVectorImpl<char> &CB) const { + support::endian::write(CB, Value, support::little); } unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp index b9ff195e0ddc..6f2ccb137235 100644 --- 
a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp @@ -13,7 +13,7 @@ #include "R600MCTargetDesc.h" #include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/SubtargetFeature.h" +#include "llvm/TargetParser/SubtargetFeature.h" using namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp deleted file mode 100644 index f659f08de027..000000000000 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ /dev/null @@ -1,594 +0,0 @@ -//===-- SIMCCodeEmitter.cpp - SI Code Emitter -----------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// The SI code emitter produces machine code that can be executed -/// directly on the GPU device. 
-// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/AMDGPUFixupKinds.h" -#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIDefines.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/APInt.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/SubtargetFeature.h" -#include "llvm/Support/Casting.h" -#include <optional> - -using namespace llvm; - -namespace { - -class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { - const MCRegisterInfo &MRI; - - /// Encode an fp or int literal - std::optional<uint32_t> getLitEncoding(const MCOperand &MO, - const MCOperandInfo &OpInfo, - const MCSubtargetInfo &STI) const; - -public: - SIMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx) - : AMDGPUMCCodeEmitter(mcii), MRI(*ctx.getRegisterInfo()) {} - SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; - SIMCCodeEmitter &operator=(const SIMCCodeEmitter &) = delete; - - /// Encode the instruction and write it to the OS. - void encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; - - void getMachineOpValue(const MCInst &MI, const MCOperand &MO, APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; - - /// Use a fixup to encode the simm16 field for SOPP branch - /// instructions. 
- void getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; - - void getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; - - void getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; - - void getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; - - void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; - -private: - uint64_t getImplicitOpSelHiEncoding(int Opcode) const; - void getMachineOpValueCommon(const MCInst &MI, const MCOperand &MO, - unsigned OpNo, APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; -}; - -} // end anonymous namespace - -MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII, - MCContext &Ctx) { - return new SIMCCodeEmitter(MCII, Ctx); -} - -// Returns the encoding value to use if the given integer is an integer inline -// immediate value, or 0 if it is not. -template <typename IntTy> -static uint32_t getIntInlineImmEncoding(IntTy Imm) { - if (Imm >= 0 && Imm <= 64) - return 128 + Imm; - - if (Imm >= -16 && Imm <= -1) - return 192 + std::abs(Imm); - - return 0; -} - -static uint32_t getLit16IntEncoding(uint16_t Val, const MCSubtargetInfo &STI) { - uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val)); - return IntImm == 0 ? 
255 : IntImm; -} - -static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo &STI) { - uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val)); - if (IntImm != 0) - return IntImm; - - if (Val == 0x3800) // 0.5 - return 240; - - if (Val == 0xB800) // -0.5 - return 241; - - if (Val == 0x3C00) // 1.0 - return 242; - - if (Val == 0xBC00) // -1.0 - return 243; - - if (Val == 0x4000) // 2.0 - return 244; - - if (Val == 0xC000) // -2.0 - return 245; - - if (Val == 0x4400) // 4.0 - return 246; - - if (Val == 0xC400) // -4.0 - return 247; - - if (Val == 0x3118 && // 1.0 / (2.0 * pi) - STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) - return 248; - - return 255; -} - -static uint32_t getLit32Encoding(uint32_t Val, const MCSubtargetInfo &STI) { - uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val)); - if (IntImm != 0) - return IntImm; - - if (Val == FloatToBits(0.5f)) - return 240; - - if (Val == FloatToBits(-0.5f)) - return 241; - - if (Val == FloatToBits(1.0f)) - return 242; - - if (Val == FloatToBits(-1.0f)) - return 243; - - if (Val == FloatToBits(2.0f)) - return 244; - - if (Val == FloatToBits(-2.0f)) - return 245; - - if (Val == FloatToBits(4.0f)) - return 246; - - if (Val == FloatToBits(-4.0f)) - return 247; - - if (Val == 0x3e22f983 && // 1.0 / (2.0 * pi) - STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) - return 248; - - return 255; -} - -static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI) { - uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val)); - if (IntImm != 0) - return IntImm; - - if (Val == DoubleToBits(0.5)) - return 240; - - if (Val == DoubleToBits(-0.5)) - return 241; - - if (Val == DoubleToBits(1.0)) - return 242; - - if (Val == DoubleToBits(-1.0)) - return 243; - - if (Val == DoubleToBits(2.0)) - return 244; - - if (Val == DoubleToBits(-2.0)) - return 245; - - if (Val == DoubleToBits(4.0)) - return 246; - - if (Val == DoubleToBits(-4.0)) - return 247; - - if (Val == 
0x3fc45f306dc9c882 && // 1.0 / (2.0 * pi) - STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) - return 248; - - return 255; -} - -std::optional<uint32_t> -SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, - const MCOperandInfo &OpInfo, - const MCSubtargetInfo &STI) const { - int64_t Imm; - if (MO.isExpr()) { - const auto *C = dyn_cast<MCConstantExpr>(MO.getExpr()); - if (!C) - return 255; - - Imm = C->getValue(); - } else { - - assert(!MO.isDFPImm()); - - if (!MO.isImm()) - return {}; - - Imm = MO.getImm(); - } - - switch (OpInfo.OperandType) { - case AMDGPU::OPERAND_REG_IMM_INT32: - case AMDGPU::OPERAND_REG_IMM_FP32: - case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED: - case AMDGPU::OPERAND_REG_INLINE_C_INT32: - case AMDGPU::OPERAND_REG_INLINE_C_FP32: - case AMDGPU::OPERAND_REG_INLINE_AC_INT32: - case AMDGPU::OPERAND_REG_INLINE_AC_FP32: - case AMDGPU::OPERAND_REG_IMM_V2INT32: - case AMDGPU::OPERAND_REG_IMM_V2FP32: - case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP32: - return getLit32Encoding(static_cast<uint32_t>(Imm), STI); - - case AMDGPU::OPERAND_REG_IMM_INT64: - case AMDGPU::OPERAND_REG_IMM_FP64: - case AMDGPU::OPERAND_REG_INLINE_C_INT64: - case AMDGPU::OPERAND_REG_INLINE_C_FP64: - case AMDGPU::OPERAND_REG_INLINE_AC_FP64: - return getLit64Encoding(static_cast<uint64_t>(Imm), STI); - - case AMDGPU::OPERAND_REG_IMM_INT16: - case AMDGPU::OPERAND_REG_INLINE_C_INT16: - case AMDGPU::OPERAND_REG_INLINE_AC_INT16: - return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI); - case AMDGPU::OPERAND_REG_IMM_FP16: - case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: - case AMDGPU::OPERAND_REG_INLINE_C_FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_FP16: - // FIXME Is this correct? What do inline immediates do on SI for f16 src - // which does not have f16 support? 
- return getLit16Encoding(static_cast<uint16_t>(Imm), STI); - case AMDGPU::OPERAND_REG_IMM_V2INT16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: { - if (!isUInt<16>(Imm) && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) - return getLit32Encoding(static_cast<uint32_t>(Imm), STI); - if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) - return getLit16Encoding(static_cast<uint16_t>(Imm), STI); - [[fallthrough]]; - } - case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI); - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { - uint16_t Lo16 = static_cast<uint16_t>(Imm); - uint32_t Encoding = getLit16Encoding(Lo16, STI); - return Encoding; - } - case AMDGPU::OPERAND_KIMM32: - case AMDGPU::OPERAND_KIMM16: - return MO.getImm(); - default: - llvm_unreachable("invalid operand size"); - } -} - -uint64_t SIMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const { - using namespace AMDGPU::VOP3PEncoding; - using namespace AMDGPU::OpName; - - if (AMDGPU::hasNamedOperand(Opcode, op_sel_hi)) { - if (AMDGPU::hasNamedOperand(Opcode, src2)) - return 0; - if (AMDGPU::hasNamedOperand(Opcode, src1)) - return OP_SEL_HI_2; - if (AMDGPU::hasNamedOperand(Opcode, src0)) - return OP_SEL_HI_1 | OP_SEL_HI_2; - } - return OP_SEL_HI_0 | OP_SEL_HI_1 | OP_SEL_HI_2; -} - -static bool isVCMPX64(const MCInstrDesc &Desc) { - return (Desc.TSFlags & SIInstrFlags::VOP3) && - Desc.hasImplicitDefOfPhysReg(AMDGPU::EXEC); -} - -void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - int Opcode = MI.getOpcode(); - APInt Encoding, Scratch; - getBinaryCodeForInstr(MI, Fixups, Encoding, Scratch, STI); - const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - unsigned bytes = Desc.getSize(); - - // Set unused op_sel_hi bits to 1 for VOP3P and MAI instructions. 
- // Note that accvgpr_read/write are MAI, have src0, but do not use op_sel. - if ((Desc.TSFlags & SIInstrFlags::VOP3P) || - Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi || - Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) { - Encoding |= getImplicitOpSelHiEncoding(Opcode); - } - - // GFX10+ v_cmpx opcodes promoted to VOP3 have implied dst=EXEC. - // Documentation requires dst to be encoded as EXEC (0x7E), - // but it looks like the actual value encoded for dst operand - // is ignored by HW. It was decided to define dst as "do not care" - // in td files to allow disassembler accept any dst value. - // However, dst is encoded as EXEC for compatibility with SP3. - if (AMDGPU::isGFX10Plus(STI) && isVCMPX64(Desc)) { - assert((Encoding & 0xFF) == 0); - Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO); - } - - for (unsigned i = 0; i < bytes; i++) { - OS.write((uint8_t)Encoding.extractBitsAsZExtValue(8, 8 * i)); - } - - // NSA encoding. - if (AMDGPU::isGFX10Plus(STI) && Desc.TSFlags & SIInstrFlags::MIMG) { - int vaddr0 = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::vaddr0); - int srsrc = AMDGPU::getNamedOperandIdx(MI.getOpcode(), - AMDGPU::OpName::srsrc); - assert(vaddr0 >= 0 && srsrc > vaddr0); - unsigned NumExtraAddrs = srsrc - vaddr0 - 1; - unsigned NumPadding = (-NumExtraAddrs) & 3; - - for (unsigned i = 0; i < NumExtraAddrs; ++i) { - getMachineOpValue(MI, MI.getOperand(vaddr0 + 1 + i), Encoding, Fixups, - STI); - OS.write((uint8_t)Encoding.getLimitedValue()); - } - for (unsigned i = 0; i < NumPadding; ++i) - OS.write(0); - } - - if ((bytes > 8 && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) || - (bytes > 4 && !STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal])) - return; - - // Do not print literals from SISrc Operands for insts with mandatory literals - if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::imm)) - return; - - // Check for additional literals - for (unsigned i = 0, e = Desc.getNumOperands(); i < e; ++i) { - - // Check if this operand 
should be encoded as [SV]Src - if (!AMDGPU::isSISrcOperand(Desc, i)) - continue; - - // Is this operand a literal immediate? - const MCOperand &Op = MI.getOperand(i); - auto Enc = getLitEncoding(Op, Desc.operands()[i], STI); - if (!Enc || *Enc != 255) - continue; - - // Yes! Encode it - int64_t Imm = 0; - - if (Op.isImm()) - Imm = Op.getImm(); - else if (Op.isExpr()) { - if (const auto *C = dyn_cast<MCConstantExpr>(Op.getExpr())) - Imm = C->getValue(); - - } else if (!Op.isExpr()) // Exprs will be replaced with a fixup value. - llvm_unreachable("Must be immediate or expr"); - - for (unsigned j = 0; j < 4; j++) { - OS.write((uint8_t) ((Imm >> (8 * j)) & 0xff)); - } - - // Only one literal value allowed - break; - } -} - -void SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, - APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpNo); - - if (MO.isExpr()) { - const MCExpr *Expr = MO.getExpr(); - MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br; - Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc())); - Op = APInt::getNullValue(96); - } else { - getMachineOpValue(MI, MO, Op, Fixups, STI); - } -} - -void SIMCCodeEmitter::getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, - APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - auto Offset = MI.getOperand(OpNo).getImm(); - // VI only supports 20-bit unsigned offsets. 
- assert(!AMDGPU::isVI(STI) || isUInt<20>(Offset)); - Op = Offset; -} - -void SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, - APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - using namespace AMDGPU::SDWA; - - uint64_t RegEnc = 0; - - const MCOperand &MO = MI.getOperand(OpNo); - - if (MO.isReg()) { - unsigned Reg = MO.getReg(); - RegEnc |= MRI.getEncodingValue(Reg); - RegEnc &= SDWA9EncValues::SRC_VGPR_MASK; - if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) { - RegEnc |= SDWA9EncValues::SRC_SGPR_MASK; - } - Op = RegEnc; - return; - } else { - const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI); - if (Enc && *Enc != 255) { - Op = *Enc | SDWA9EncValues::SRC_SGPR_MASK; - return; - } - } - - llvm_unreachable("Unsupported operand kind"); -} - -void SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, - APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - using namespace AMDGPU::SDWA; - - uint64_t RegEnc = 0; - - const MCOperand &MO = MI.getOperand(OpNo); - - unsigned Reg = MO.getReg(); - if (Reg != AMDGPU::VCC && Reg != AMDGPU::VCC_LO) { - RegEnc |= MRI.getEncodingValue(Reg); - RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK; - RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK; - } - Op = RegEnc; -} - -void SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo, - APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - unsigned Reg = MI.getOperand(OpNo).getReg(); - uint64_t Enc = MRI.getEncodingValue(Reg); - - // VGPR and AGPR have the same encoding, but SrcA and SrcB operands of mfma - // instructions use acc[0:1] modifier bits to distinguish. These bits are - // encoded as a virtual 9th bit of the register for these operands. 
- if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Reg) || - MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg) || - MRI.getRegClass(AMDGPU::AReg_96RegClassID).contains(Reg) || - MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(Reg) || - MRI.getRegClass(AMDGPU::AReg_160RegClassID).contains(Reg) || - MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) || - MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) || - MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) || - MRI.getRegClass(AMDGPU::AReg_288RegClassID).contains(Reg) || - MRI.getRegClass(AMDGPU::AReg_320RegClassID).contains(Reg) || - MRI.getRegClass(AMDGPU::AReg_352RegClassID).contains(Reg) || - MRI.getRegClass(AMDGPU::AReg_384RegClassID).contains(Reg) || - MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(Reg) || - MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg)) - Enc |= 512; - - Op = Enc; -} - -static bool needsPCRel(const MCExpr *Expr) { - switch (Expr->getKind()) { - case MCExpr::SymbolRef: { - auto *SE = cast<MCSymbolRefExpr>(Expr); - MCSymbolRefExpr::VariantKind Kind = SE->getKind(); - return Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_LO && - Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_HI; - } - case MCExpr::Binary: { - auto *BE = cast<MCBinaryExpr>(Expr); - if (BE->getOpcode() == MCBinaryExpr::Sub) - return false; - return needsPCRel(BE->getLHS()) || needsPCRel(BE->getRHS()); - } - case MCExpr::Unary: - return needsPCRel(cast<MCUnaryExpr>(Expr)->getSubExpr()); - case MCExpr::Target: - case MCExpr::Constant: - return false; - } - llvm_unreachable("invalid kind"); -} - -void SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, - const MCOperand &MO, APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - if (MO.isReg()){ - Op = MRI.getEncodingValue(MO.getReg()); - return; - } - unsigned OpNo = &MO - MI.begin(); - getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI); -} - -void SIMCCodeEmitter::getMachineOpValueCommon( - const 
MCInst &MI, const MCOperand &MO, unsigned OpNo, APInt &Op, - SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { - - if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) { - // FIXME: If this is expression is PCRel or not should not depend on what - // the expression looks like. Given that this is just a general expression, - // it should probably be FK_Data_4 and whatever is producing - // - // s_add_u32 s2, s2, (extern_const_addrspace+16 - // - // And expecting a PCRel should instead produce - // - // .Ltmp1: - // s_add_u32 s2, s2, (extern_const_addrspace+16)-.Ltmp1 - MCFixupKind Kind; - if (needsPCRel(MO.getExpr())) - Kind = FK_PCRel_4; - else - Kind = FK_Data_4; - - const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - uint32_t Offset = Desc.getSize(); - assert(Offset == 4 || Offset == 8); - - Fixups.push_back(MCFixup::create(Offset, MO.getExpr(), Kind, MI.getLoc())); - } - - const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - if (AMDGPU::isSISrcOperand(Desc, OpNo)) { - if (auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI)) { - Op = *Enc; - return; - } - } else if (MO.isImm()) { - Op = MO.getImm(); - return; - } - - llvm_unreachable("Encoding of this operand type is not supported yet."); -} - -#include "AMDGPUGenMCCodeEmitter.inc" diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index c295b7f79442..d924f733624a 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -48,6 +48,7 @@ class MIMGBaseOpcode : PredicateControl { bit IsAtomicRet = 0; bit MSAA = 0; bit BVH = 0; + bit A16 = 0; } def MIMGBaseOpcode : GenericEnum { @@ -59,7 +60,7 @@ def MIMGBaseOpcodesTable : GenericTable { let CppTypeName = "MIMGBaseOpcodeInfo"; let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates", - "LodOrClampOrMip", "HasD16", "MSAA", "BVH"]; + "LodOrClampOrMip", 
"HasD16", "MSAA", "BVH", "A16"]; string TypeOf_BaseOpcode = "MIMGBaseOpcode"; let PrimaryKey = ["BaseOpcode"]; @@ -206,7 +207,6 @@ class MIMG <dag outs, string dns = ""> : MIMG_Base <outs, dns> { let hasPostISelHook = 1; - let AsmMatchConverter = "cvtMIMG"; Instruction Opcode = !cast<Instruction>(NAME); MIMGBaseOpcode BaseOpcode; @@ -235,22 +235,41 @@ def getMIMGInfo : SearchIndex { let Key = ["Opcode"]; } -// This class used to use !foldl to memoize the AddrAsmNames list. -// It turned out that that was much slower than using !filter. +class NSAHelper { + dag AddrIns; + string AddrAsm; + int NSA; +} + class MIMGNSAHelper<int num_addrs, - list<RegisterClass> addr_types=!listsplat(VGPR_32, num_addrs)> { - list<string> AddrAsmNames = - !foreach(i, !filter(i, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], - !lt(i, num_addrs)), "vaddr" # i); - dag AddrIns = !dag(ins, addr_types, AddrAsmNames); - string AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]"; + list<RegisterClass> addr_types=!listsplat(VGPR_32, num_addrs)> + : NSAHelper<> { + list<string> AddrAsmNames = !foreach(i, !range(num_addrs), "vaddr" # i); + let AddrIns = !dag(ins, addr_types, AddrAsmNames); + let AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]"; - int NSA = !if(!le(num_addrs, 1), ?, + let NSA = !if(!le(num_addrs, 1), ?, !if(!le(num_addrs, 5), 1, !if(!le(num_addrs, 9), 2, !if(!le(num_addrs, 13), 3, ?)))); } +class PartialNSAHelper<int num_addrs, int max_addr, RegisterClass LastAddrRC> + : NSAHelper<> { + + list<RegisterClass> addr_types = + !if(!ge(num_addrs, max_addr), + !listconcat(!listsplat(VGPR_32, !sub(max_addr, 1)), [LastAddrRC]), + !listsplat(VGPR_32, num_addrs)); + + int VAddrCount = !if(!gt(num_addrs, max_addr), max_addr, num_addrs); + list<string> AddrAsmNames = !foreach(i, !range(VAddrCount), "vaddr" # i); + + let AddrIns = !dag(ins, addr_types, AddrAsmNames); + let AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]"; + let NSA = 1; +} + // Base class of all pre-gfx10 MIMG 
instructions. class MIMG_gfx6789<bits<8> op, dag outs, string dns = ""> : MIMG<outs, dns>, MIMGe_gfx6789<op> { @@ -321,7 +340,8 @@ class MIMG_gfx11<int op, dag outs, string dns = ""> // Base class for all NSA MIMG instructions. // Note that 1-dword addresses always use non-NSA variants. class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="", - list<RegisterClass> addr_types=[]> + list<RegisterClass> addr_types=[], + RegisterClass LastAddrRC = VGPR_32> : MIMG<outs, dns>, MIMGe_gfx11<op> { let SubtargetPredicate = isGFX11Plus; let AssemblerPredicate = isGFX11Plus; @@ -329,9 +349,9 @@ class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="", let MIMGEncoding = MIMGEncGfx11NSA; let VAddrOperands = num_addrs; - MIMGNSAHelper nsah = !if(!empty(addr_types), - MIMGNSAHelper<num_addrs>, - MIMGNSAHelper<num_addrs, addr_types>); + NSAHelper nsah = !if(!empty(addr_types), + PartialNSAHelper<num_addrs, 5, LastAddrRC>, + MIMGNSAHelper<num_addrs, addr_types>); dag AddrIns = nsah.AddrIns; string AddrAsm = nsah.AddrAsm; @@ -672,7 +692,6 @@ class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc, RegisterClass addr_rc, string dns=""> : MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> { let Constraints = "$vdst = $vdata"; - let AsmMatchConverter = "cvtMIMGAtomic"; let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, CPol:$cpol, @@ -684,7 +703,6 @@ class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterClass data_rc, RegisterClass addr_rc, string dns=""> : MIMG_gfx90a <op, (outs getLdStRegisterOperand<data_rc>.ret:$vdst), dns> { let Constraints = "$vdst = $vdata"; - let AsmMatchConverter = "cvtMIMGAtomic"; let InOperandList = (ins getLdStRegisterOperand<data_rc>.ret:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, @@ -720,7 +738,6 @@ class MIMG_Atomic_gfx10<mimgopc op, string opcode, : MIMG_gfx10<!cast<int>(op.GFX10M), (outs DataRC:$vdst), !if(enableDisasm, "AMDGPU", "")> { let Constraints = 
"$vdst = $vdata"; - let AsmMatchConverter = "cvtMIMGAtomic"; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, @@ -734,7 +751,6 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode, : MIMG_nsa_gfx10<!cast<int>(op.GFX10M), (outs DataRC:$vdst), num_addrs, !if(enableDisasm, "AMDGPU", "")> { let Constraints = "$vdst = $vdata"; - let AsmMatchConverter = "cvtMIMGAtomic"; let InOperandList = !con((ins DataRC:$vdata), AddrIns, @@ -750,7 +766,6 @@ class MIMG_Atomic_gfx11<mimgopc op, string opcode, : MIMG_gfx11<!cast<int>(op.GFX11), (outs DataRC:$vdst), !if(enableDisasm, "AMDGPU", "")> { let Constraints = "$vdst = $vdata"; - let AsmMatchConverter = "cvtMIMGAtomic"; let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, @@ -764,7 +779,6 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode, : MIMG_nsa_gfx11<!cast<int>(op.GFX11), (outs DataRC:$vdst), num_addrs, !if(enableDisasm, "AMDGPU", "")> { let Constraints = "$vdst = $vdata"; - let AsmMatchConverter = "cvtMIMGAtomic"; let InOperandList = !con((ins DataRC:$vdata), AddrIns, @@ -934,8 +948,9 @@ class MIMG_Sampler_gfx11<mimgopc op, string opcode, class MIMG_Sampler_nsa_gfx11<mimgopc op, string opcode, RegisterClass DataRC, int num_addrs, - string dns=""> - : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> { + RegisterClass LastVAddrSize, string dns=""> + : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns, [], + LastVAddrSize> { let InOperandList = !con(AddrIns, (ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol, @@ -946,29 +961,34 @@ class MIMG_Sampler_nsa_gfx11<mimgopc op, string opcode, #!if(BaseOpcode.HasD16, "$d16", ""); } -class MIMGAddrSize<int dw, bit enable_disasm> { +class MIMGAddrSize<int dw, bit enable_disasm, int AddrDW = dw> { int NumWords = dw; - RegisterClass RegClass = !if(!le(NumWords, 0), ?, - 
!if(!eq(NumWords, 1), VGPR_32, - !if(!eq(NumWords, 2), VReg_64, - !if(!eq(NumWords, 3), VReg_96, - !if(!eq(NumWords, 4), VReg_128, - !if(!eq(NumWords, 5), VReg_160, - !if(!eq(NumWords, 6), VReg_192, - !if(!eq(NumWords, 7), VReg_224, - !if(!le(NumWords, 8), VReg_256, - !if(!le(NumWords, 9), VReg_288, - !if(!le(NumWords, 10), VReg_320, - !if(!le(NumWords, 11), VReg_352, - !if(!le(NumWords, 12), VReg_384, - !if(!le(NumWords, 16), VReg_512, ?)))))))))))))); + RegisterClass RegClass = !if(!le(AddrDW, 0), ?, + !if(!eq(AddrDW, 1), VGPR_32, + !if(!eq(AddrDW, 2), VReg_64, + !if(!eq(AddrDW, 3), VReg_96, + !if(!eq(AddrDW, 4), VReg_128, + !if(!eq(AddrDW, 5), VReg_160, + !if(!eq(AddrDW, 6), VReg_192, + !if(!eq(AddrDW, 7), VReg_224, + !if(!eq(AddrDW, 8), VReg_256, + !if(!eq(AddrDW, 9), VReg_288, + !if(!eq(AddrDW, 10), VReg_320, + !if(!eq(AddrDW, 11), VReg_352, + !if(!eq(AddrDW, 12), VReg_384, + !if(!le(AddrDW, 16), VReg_512, ?)))))))))))))); // Whether the instruction variant with this vaddr size should be enabled for // the auto-generated disassembler. bit Disassemble = enable_disasm; } +// Returns the MIMGAddrSize with the size of last VAddr for partial NSA +class LastVAddrSize <int dw, int max_idx, bit enable_disasm> + : MIMGAddrSize<dw, enable_disasm, + !if(!gt(dw, max_idx), !sub(dw, max_idx), 0)>; + // Return whether x is in lst. class isIntInList<int x, list<int> lst> { bit ret = !foldl(0, lst, lhs, y, !or(lhs, !eq(x, y))); @@ -985,7 +1005,8 @@ class MIMGAddrSizes_dw_range<list<int> range> { int Max = !if(!empty(!tail(range)), Min, !head(!tail(range))); } -class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16> { +class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16, + int nsa_max_addr = 5> { // List of all possible numbers of address words, taking all combinations of // A16 and image dimension into account (note: no MSAA, since this is for // sample/gather ops). 
@@ -1031,6 +1052,21 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16> { !if(isIntInList<dw, AllNumAddrWords>.ret, !listconcat(lhs, [MIMGAddrSize<dw, !empty(lhs)>]), lhs)))); + + // In NSA format if there is a requirement for more VGPRs than the format + // supports, then the rest are sequential after the last one. Generate + // machine instructions for all possible number of words. The disassembler + // defaults to the largest number of arguments but no larger than max nsa + // size. List is generated with the register class needed for last vaddr since + // it is the only one that could have a register other than VGPR32. + int EnableDisasmNum = !foldl(!head(AllNumAddrWords), !tail(AllNumAddrWords), + acc, var, !if(!le(var, nsa_max_addr), var, acc)); + list<LastVAddrSize> PartialNSAInstrs = + !foldl([]<LastVAddrSize>, [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2], lhs, dw, + !if(isIntInList<dw, AllNumAddrWords>.ret, + !listconcat(lhs, [LastVAddrSize<dw, !sub(nsa_max_addr, 1), + !eq(dw, EnableDisasmNum)>]), + lhs)); } multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm, @@ -1066,9 +1102,14 @@ multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm, : MIMG_Sampler_nsa_gfx10<op, asm, dst_rc, addr.NumWords, !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; } - if !and(op.HAS_GFX11, !le(addr.NumWords, 5)) then { + } + } + + foreach addr = MIMG_Sampler_AddrSizes<sample, isG16, 5/*MaxNSASize*/>.PartialNSAInstrs in { + let VAddrDwords = addr.NumWords in { + if op.HAS_GFX11 then { def _V # addr.NumWords # _nsa_gfx11 - : MIMG_Sampler_nsa_gfx11<op, asm, dst_rc, addr.NumWords, + : MIMG_Sampler_nsa_gfx11<op, asm, dst_rc, addr.NumWords, addr.RegClass, !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; } } @@ -1144,51 +1185,43 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16> { [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]); } -class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC, bit IsA16> +class 
MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC> : MIMG_gfx10<op.GFX10M, (outs VReg_128:$vdata), "AMDGPU"> { - - let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc), - !if(IsA16, (ins A16:$a16), (ins))); - let AsmString = opcode#" $vdata, $vaddr0, $srsrc"#!if(IsA16, "$a16", ""); + let InOperandList = (ins AddrRC:$vaddr0, SReg_128:$srsrc, A16:$a16); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$a16"; let nsa = 0; } -class MIMG_IntersectRay_nsa_gfx10<mimgopc op, string opcode, int num_addrs, bit IsA16> +class MIMG_IntersectRay_nsa_gfx10<mimgopc op, string opcode, int num_addrs> : MIMG_nsa_gfx10<op.GFX10M, (outs VReg_128:$vdata), num_addrs, "AMDGPU"> { - let InOperandList = !con(nsah.AddrIns, - (ins SReg_128:$srsrc), - !if(IsA16, (ins A16:$a16), (ins))); - let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc"#!if(IsA16, "$a16", ""); + let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$srsrc, A16:$a16)); + let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc$a16"; } -class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterClass AddrRC, bit IsA16> +class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterClass AddrRC> : MIMG_gfx11<op.GFX11, (outs VReg_128:$vdata), "AMDGPU"> { - - let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc), - !if(IsA16, (ins A16:$a16), (ins))); - let AsmString = opcode#" $vdata, $vaddr0, $srsrc"#!if(IsA16, "$a16", ""); + let InOperandList = (ins AddrRC:$vaddr0, SReg_128:$srsrc, A16:$a16); + let AsmString = opcode#" $vdata, $vaddr0, $srsrc$a16"; let nsa = 0; } class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs, - bit IsA16, list<RegisterClass> addr_types> + list<RegisterClass> addr_types> : MIMG_nsa_gfx11<op.GFX11, (outs VReg_128:$vdata), num_addrs, "AMDGPU", addr_types> { - let InOperandList = !con(nsah.AddrIns, - (ins SReg_128:$srsrc), - !if(IsA16, (ins A16:$a16), (ins))); - let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", 
$srsrc"#!if(IsA16, "$a16", ""); + let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$srsrc, A16:$a16)); + let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc$a16"; } multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit IsA16> { defvar info = MIMG_IntersectRay_Helper<Is64, IsA16>; def "" : MIMGBaseOpcode { let BVH = 1; + let A16 = IsA16; } - let AsmMatchConverter = !if(IsA16, "cvtIntersectRay", ""), - dmask = 0xf, + let dmask = 0xf, unorm = 1, d16 = 0, cpol = 0, @@ -1201,17 +1234,17 @@ multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit IsA16> { d16 = 0, BaseOpcode = !cast<MIMGBaseOpcode>(NAME), VDataDwords = 4 in { - def _sa_gfx10 : MIMG_IntersectRay_gfx10<op, opcode, info.RegClass, IsA16> { + def _sa_gfx10 : MIMG_IntersectRay_gfx10<op, opcode, info.RegClass> { let VAddrDwords = info.VAddrDwords; } - def _sa_gfx11 : MIMG_IntersectRay_gfx11<op, opcode, info.RegClass, IsA16> { + def _sa_gfx11 : MIMG_IntersectRay_gfx11<op, opcode, info.RegClass> { let VAddrDwords = info.VAddrDwords; } - def _nsa_gfx10 : MIMG_IntersectRay_nsa_gfx10<op, opcode, info.num_addrs, IsA16> { + def _nsa_gfx10 : MIMG_IntersectRay_nsa_gfx10<op, opcode, info.num_addrs> { let VAddrDwords = info.num_addrs; } def _nsa_gfx11 : MIMG_IntersectRay_nsa_gfx11<op, opcode, - info.gfx11_nsa_addrs, IsA16, + info.gfx11_nsa_addrs, info.gfx11_addr_types> { let VAddrDwords = info.num_addrs; } diff --git a/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp index 50a90dd03f38..20c2ff8a4fd7 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp @@ -20,7 +20,7 @@ namespace { class R600DAGToDAGISel : public AMDGPUDAGToDAGISel { - const R600Subtarget *Subtarget; + const R600Subtarget *Subtarget = nullptr; bool isConstantLoad(const MemSDNode *N, int cbID) const; bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue &IntPtr); diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index fad393267a71..ad072cfe23b1 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -953,10 +953,8 @@ SDValue R600TargetLowering::lowerADDRSPACECAST(SDValue Op, unsigned SrcAS = ASC->getSrcAddressSpace(); unsigned DestAS = ASC->getDestAddressSpace(); - if (auto *ConstSrc = dyn_cast<ConstantSDNode>(Op.getOperand(0))) { - if (SrcAS == AMDGPUAS::FLAT_ADDRESS && ConstSrc->isNullValue()) - return DAG.getConstant(TM.getNullPointerValue(DestAS), SL, VT); - } + if (isNullConstant(Op.getOperand(0)) && SrcAS == AMDGPUAS::FLAT_ADDRESS) + return DAG.getConstant(TM.getNullPointerValue(DestAS), SL, VT); return Op; } @@ -1656,7 +1654,7 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); for (unsigned i = 0; i < 4; i++) { unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); - if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) + if (SwizzleRemap.contains(Idx)) Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); } @@ -1664,7 +1662,7 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); for (unsigned i = 0; i < 4; i++) { unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); - if (SwizzleRemap.find(Idx) != SwizzleRemap.end()) + if (SwizzleRemap.contains(Idx)) Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); } @@ -2182,3 +2180,18 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, return Node; } + +TargetLowering::AtomicExpansionKind +R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { + switch (RMW->getOperation()) { + case AtomicRMWInst::UIncWrap: + case AtomicRMWInst::UDecWrap: + // FIXME: Cayman at least appears to have instructions for this, but the + // instruction defintions appear to be missing. 
+ return AtomicExpansionKind::CmpXChg; + default: + break; + } + + return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); +} diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h index 8a5479db4ee6..fc361c01bc67 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h @@ -114,6 +114,9 @@ private: SelectionDAG &DAG) const; SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; + + TargetLowering::AtomicExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const override; }; } // End namespace llvm; diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 4056274cd440..7f874b245b8f 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -328,7 +328,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI, if (Reg == R600::OQAP) { Result.push_back(std::pair(Index, 0U)); } - if (PV.find(Reg) != PV.end()) { + if (PV.contains(Reg)) { // 255 is used to tells its a PS/PV reg Result.push_back(std::pair(255, 0U)); continue; diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td index b53e9c258fd9..f4dfbe8adc75 100644 --- a/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -1090,7 +1090,7 @@ multiclass CUBE_Common <bits<11> inst> { } // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper < - inst, "EXP_IEEE", fexp2 + inst, "EXP_IEEE", AMDGPUexp > { let Itinerary = TransALU; } @@ -1124,7 +1124,7 @@ class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP < >; class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper < - inst, "LOG_IEEE", flog2 + inst, "LOG_IEEE", AMDGPUlog > { let Itinerary = TransALU; } diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp index 
c01f9c4794c7..1a1be4a44285 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp @@ -82,10 +82,10 @@ bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); } -unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) { +unsigned R600TTIImpl::getMaxInterleaveFactor(ElementCount VF) { // Disable unrolling if the loop is not vectorized. // TODO: Enable this again. - if (VF == 1) + if (VF.isScalar()) return 1; return 8; diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h index 8dacae0abb7b..2934b0151f4d 100644 --- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h @@ -57,7 +57,7 @@ public: unsigned AddrSpace) const; bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const; - unsigned getMaxInterleaveFactor(unsigned VF); + unsigned getMaxInterleaveFactor(ElementCount VF); InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); using BaseT::getVectorInstrCost; diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index f232bc9b3852..b87cd8c66cc8 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -13,8 +13,8 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -36,7 +36,7 @@ using StackEntry = std::pair<BasicBlock *, Value *>; using StackVector = SmallVector<StackEntry, 16>; class SIAnnotateControlFlow : public FunctionPass { - 
LegacyDivergenceAnalysis *DA; + UniformityInfo *UA; Type *Boolean; Type *Void; @@ -99,7 +99,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<LegacyDivergenceAnalysis>(); + AU.addRequired<UniformityInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<TargetPassConfig>(); @@ -112,7 +112,7 @@ public: INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) @@ -146,7 +146,7 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) { /// Is the branch condition uniform or did the StructurizeCFG pass /// consider it as such? 
bool SIAnnotateControlFlow::isUniform(BranchInst *T) { - return DA->isUniform(T) || + return UA->isUniform(T) || T->getMetadata("structurizecfg.uniform") != nullptr; } @@ -336,7 +336,7 @@ bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { bool SIAnnotateControlFlow::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - DA = &getAnalysis<LegacyDivergenceAnalysis>(); + UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); const TargetMachine &TM = TPC.getTM<TargetMachine>(); diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 97a583421a7e..cd1818285e3e 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -16,11 +16,36 @@ namespace llvm { // This needs to be kept in sync with the field bits in SIRegisterClass. enum SIRCFlags : uint8_t { - // For vector registers. - HasVGPR = 1 << 0, - HasAGPR = 1 << 1, - HasSGPR = 1 << 2 -}; // enum SIRCFlags + RegTupleAlignUnitsWidth = 2, + HasVGPRBit = RegTupleAlignUnitsWidth, + HasAGPRBit, + HasSGPRbit, + + HasVGPR = 1 << HasVGPRBit, + HasAGPR = 1 << HasAGPRBit, + HasSGPR = 1 << HasSGPRbit, + + RegTupleAlignUnitsMask = (1 << RegTupleAlignUnitsWidth) - 1, + RegKindMask = (HasVGPR | HasAGPR | HasSGPR) +}; // enum SIRCFlagsr + +namespace SIEncodingFamily { +// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td +// and the columns of the getMCOpcodeGen table. +enum { + SI = 0, + VI = 1, + SDWA = 2, + SDWA9 = 3, + GFX80 = 4, + GFX9 = 5, + GFX10 = 6, + SDWA10 = 7, + GFX90A = 8, + GFX940 = 9, + GFX11 = 10, +}; +} namespace SIInstrFlags { // This needs to be kept in sync with the field bits in InstSI. @@ -133,6 +158,9 @@ enum : uint64_t { // Whether tied sources will be read. TiedSourceNotRead = UINT64_C(1) << 60, + + // Is never uniform. 
+ IsNeverUniform = UINT64_C(1) << 61, }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. @@ -222,6 +250,7 @@ enum OperandType : unsigned { // NEG and SEXT share same bit-mask because they can't be set simultaneously. namespace SISrcMods { enum : unsigned { + NONE = 0, NEG = 1 << 0, // Floating-point negate modifier ABS = 1 << 1, // Floating-point absolute modifier SEXT = 1 << 0, // Integer sign-extend modifier @@ -333,7 +362,7 @@ enum Id { // Message ID, width(4) [3:0]. ID_SAVEWAVE = 4, // added in GFX8, removed in GFX11 ID_STALL_WAVE_GEN = 5, // added in GFX9 ID_HALT_WAVES = 6, // added in GFX9 - ID_ORDERED_PS_DONE = 7, // added in GFX9 + ID_ORDERED_PS_DONE = 7, // added in GFX9, removed in GFX11 ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10 ID_GS_ALLOC_REQ = 9, // added in GFX9 ID_GET_DOORBELL = 10, // added in GFX9, removed in GFX11 @@ -401,19 +430,26 @@ enum Id { // HwRegCode, (6) [5:0] ID_TBA_HI = 17, ID_TMA_LO = 18, ID_TMA_HI = 19, - ID_XCC_ID = 20, - ID_SQ_PERF_SNAPSHOT_DATA = 21, - ID_SQ_PERF_SNAPSHOT_DATA1 = 22, - ID_SQ_PERF_SNAPSHOT_PC_LO = 23, - ID_SQ_PERF_SNAPSHOT_PC_HI = 24, ID_FLAT_SCR_LO = 20, ID_FLAT_SCR_HI = 21, ID_XNACK_MASK = 22, ID_HW_ID1 = 23, ID_HW_ID2 = 24, ID_POPS_PACKER = 25, + ID_PERF_SNAPSHOT_DATA = 27, ID_SHADER_CYCLES = 29, + // Register numbers reused in GFX11+ + ID_PERF_SNAPSHOT_PC_LO = 18, + ID_PERF_SNAPSHOT_PC_HI = 19, + + // GFX940 specific registers + ID_XCC_ID = 20, + ID_SQ_PERF_SNAPSHOT_DATA = 21, + ID_SQ_PERF_SNAPSHOT_DATA1 = 22, + ID_SQ_PERF_SNAPSHOT_PC_LO = 23, + ID_SQ_PERF_SNAPSHOT_PC_HI = 24, + ID_SHIFT_ = 0, ID_WIDTH_ = 6, ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) @@ -909,6 +945,17 @@ enum Offset_COV5 : unsigned { }; } // namespace ImplicitArg + +namespace VirtRegFlag { +// Virtual register flags used for various target specific handlings during +// codegen. +enum Register_Flag : uint8_t { + // Register operand in a whole-wave mode operation. 
+ WWM_REG = 1 << 0, +}; + +} // namespace VirtRegFlag + } // namespace AMDGPU #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index e5a028823e72..db323465c153 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -231,7 +231,7 @@ static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END) return false; - unsigned OpIdx = UseMI->getOperandNo(&MO); + unsigned OpIdx = MO.getOperandNo(); if (OpIdx >= UseMI->getDesc().getNumOperands() || !TII->isOperandLegal(*UseMI, OpIdx, &Src)) return false; @@ -658,7 +658,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { TRI->getEquivalentSGPRClass(SrcRC); Register NewDst = MRI->createVirtualRegister(DestRC); MachineBasicBlock *BlockToInsertCopy = - MI.isPHI() ? MI.getOperand(MI.getOperandNo(&MO) + 1).getMBB() + MI.isPHI() ? MI.getOperand(MO.getOperandNo() + 1).getMBB() : MBB; MachineBasicBlock::iterator PointToInsertCopy = MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I; @@ -869,7 +869,9 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI, return true; } if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) { - TII->moveToVALU(MI, MDT); + SIInstrWorklist worklist; + worklist.insert(&MI); + TII->moveToVALU(worklist, MDT); return true; } @@ -991,6 +993,10 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { LoweringWorklist.push_back(C.second.ID); } + // Store all the V2S copy instructions that need to be moved to VALU + // in the Copies worklist. 
+ SIInstrWorklist Copies; + while (!LoweringWorklist.empty()) { unsigned CurID = LoweringWorklist.pop_back_val(); auto CurInfoIt = V2SCopies.find(CurID); @@ -1013,10 +1019,13 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy << " is being turned to VALU\n"); V2SCopies.erase(C.ID); - TII->moveToVALU(*C.Copy, MDT); + Copies.insert(C.Copy); } } + TII->moveToVALU(Copies, MDT); + Copies.clear(); + // Now do actual lowering for (auto C : V2SCopies) { MachineInstr *MI = C.second.Copy; diff --git a/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp index f7e3ea5fc072..08272a9ddfd3 100644 --- a/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp @@ -31,6 +31,11 @@ public: initializeSIFixVGPRCopiesPass(*PassRegistry::getPassRegistry()); } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { return "SI Fix VGPR copies"; } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 9c0c665a318c..9f1d6038f1b6 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -111,9 +111,11 @@ public: std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const; bool tryFoldOMod(MachineInstr &MI); bool tryFoldRegSequence(MachineInstr &MI); - bool tryFoldLCSSAPhi(MachineInstr &MI); + bool tryFoldPhiAGPR(MachineInstr &MI); bool tryFoldLoad(MachineInstr &MI); + bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB); + public: SIFoldOperands() : MachineFunctionPass(ID) { initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry()); @@ -138,6 +140,16 @@ char SIFoldOperands::ID = 0; char &llvm::SIFoldOperandsID = SIFoldOperands::ID; +static const TargetRegisterClass 
*getRegOpRC(const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + const MachineOperand &MO) { + const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg()); + if (const TargetRegisterClass *SubRC = + TRI.getSubRegisterClass(RC, MO.getSubReg())) + RC = SubRC; + return RC; +} + // Map multiply-accumulate opcode to corresponding multiply-add opcode if any. static unsigned macToMad(unsigned Opc) { switch (Opc) { @@ -341,14 +353,17 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, // Check if changing this to a v_mad_{f16, f32} instruction will allow us // to fold the operand. MI->setDesc(TII->get(NewOpc)); - if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) && - AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel)) + bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) && + AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel); + if (AddOpSel) MI->addOperand(MachineOperand::CreateImm(0)); bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold); if (FoldAsMAD) { MI->untieRegOperand(OpNo); return true; } + if (AddOpSel) + MI->removeOperand(MI->getNumExplicitOperands() - 1); MI->setDesc(TII->get(Opc)); } @@ -893,11 +908,10 @@ void SIFoldOperands::foldOperand( TRI->getRegClass(FoldDesc.operands()[0].RegClass); // Split 64-bit constants into 32-bits for folding. - if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) { + if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) { Register UseReg = UseOp.getReg(); const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg); - - if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64) + if (AMDGPU::getRegBitWidth(*UseRC) != 64) return; APInt Imm(64, OpToFold.getImm()); @@ -1628,52 +1642,175 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) { return true; } -// Try to hoist an AGPR to VGPR copy out of the loop across a LCSSA PHI. -// This should allow folding of an AGPR into a consumer which may support it. 
-// I.e.: -// -// loop: // loop: -// %1:vreg = COPY %0:areg // exit: -// exit: => // %1:areg = PHI %0:areg, %loop -// %2:vreg = PHI %1:vreg, %loop // %2:vreg = COPY %1:areg -bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) { - assert(PHI.isPHI()); +/// Checks whether \p Copy is a AGPR -> VGPR copy. Returns `true` on success and +/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg +static bool isAGPRCopy(const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI, const MachineInstr &Copy, + Register &OutReg, unsigned &OutSubReg) { + assert(Copy.isCopy()); - if (PHI.getNumExplicitOperands() != 3) // Single input LCSSA PHI + const MachineOperand &CopySrc = Copy.getOperand(1); + Register CopySrcReg = CopySrc.getReg(); + if (!CopySrcReg.isVirtual()) return false; - Register PhiIn = PHI.getOperand(1).getReg(); - Register PhiOut = PHI.getOperand(0).getReg(); - if (PHI.getOperand(1).getSubReg() || - !TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut)) + // Common case: copy from AGPR directly, e.g. + // %1:vgpr_32 = COPY %0:agpr_32 + if (TRI.isAGPR(MRI, CopySrcReg)) { + OutReg = CopySrcReg; + OutSubReg = CopySrc.getSubReg(); + return true; + } + + // Sometimes it can also involve two copies, e.g. + // %1:vgpr_256 = COPY %0:agpr_256 + // %2:vgpr_32 = COPY %1:vgpr_256.sub0 + const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg); + if (!CopySrcDef || !CopySrcDef->isCopy()) return false; - // A single use should not matter for correctness, but if it has another use - // inside the loop we may perform copy twice in a worst case. 
- if (!MRI->hasOneNonDBGUse(PhiIn)) + const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1); + Register OtherCopySrcReg = OtherCopySrc.getReg(); + if (!OtherCopySrcReg.isVirtual() || + CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister || + OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister || + !TRI.isAGPR(MRI, OtherCopySrcReg)) return false; - MachineInstr *Copy = MRI->getVRegDef(PhiIn); - if (!Copy || !Copy->isCopy()) + OutReg = OtherCopySrcReg; + OutSubReg = CopySrc.getSubReg(); + return true; +} + +// Try to hoist an AGPR to VGPR copy across a PHI. +// This should allow folding of an AGPR into a consumer which may support it. +// +// Example 1: LCSSA PHI +// loop: +// %1:vreg = COPY %0:areg +// exit: +// %2:vreg = PHI %1:vreg, %loop +// => +// loop: +// exit: +// %1:areg = PHI %0:areg, %loop +// %2:vreg = COPY %1:areg +// +// Example 2: PHI with multiple incoming values: +// entry: +// %1:vreg = GLOBAL_LOAD(..) +// loop: +// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop +// %3:areg = COPY %2:vreg +// %4:areg = (instr using %3:areg) +// %5:vreg = COPY %4:areg +// => +// entry: +// %1:vreg = GLOBAL_LOAD(..) +// %2:areg = COPY %1:vreg +// loop: +// %3:areg = PHI %2:areg, %entry, %X:areg, +// %4:areg = (instr using %3:areg) +bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) { + assert(PHI.isPHI()); + + Register PhiOut = PHI.getOperand(0).getReg(); + if (!TRI->isVGPR(*MRI, PhiOut)) return false; - Register CopyIn = Copy->getOperand(1).getReg(); - if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg()) + // Iterate once over all incoming values of the PHI to check if this PHI is + // eligible, and determine the exact AGPR RC we'll target. 
+ const TargetRegisterClass *ARC = nullptr; + for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) { + MachineOperand &MO = PHI.getOperand(K); + MachineInstr *Copy = MRI->getVRegDef(MO.getReg()); + if (!Copy || !Copy->isCopy()) + continue; + + Register AGPRSrc; + unsigned AGPRRegMask = AMDGPU::NoSubRegister; + if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask)) + continue; + + const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc); + if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask)) + CopyInRC = SubRC; + + if (ARC && !ARC->hasSubClassEq(CopyInRC)) + return false; + ARC = CopyInRC; + } + + if (!ARC) return false; - const TargetRegisterClass *ARC = MRI->getRegClass(CopyIn); + bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass); + + // Rewrite the PHI's incoming values to ARC. + LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI); + for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) { + MachineOperand &MO = PHI.getOperand(K); + Register Reg = MO.getReg(); + + MachineBasicBlock::iterator InsertPt; + MachineBasicBlock *InsertMBB = nullptr; + + // Look at the def of Reg, ignoring all copies. + unsigned CopyOpc = AMDGPU::COPY; + if (MachineInstr *Def = MRI->getVRegDef(Reg)) { + + // Look at pre-existing COPY instructions from ARC: Steal the operand. If + // the copy was single-use, it will be removed by DCE later. + if (Def->isCopy()) { + Register AGPRSrc; + unsigned AGPRSubReg = AMDGPU::NoSubRegister; + if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) { + MO.setReg(AGPRSrc); + MO.setSubReg(AGPRSubReg); + continue; + } + + // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on + // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try + // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which + // is unlikely to be profitable. + // + // Note that V_ACCVGPR_WRITE is only used for AGPR_32. 
+ MachineOperand &CopyIn = Def->getOperand(1); + if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) && + TRI->isSGPRReg(*MRI, CopyIn.getReg())) + CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64; + } + + InsertMBB = Def->getParent(); + InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator()); + } else { + InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB(); + InsertPt = InsertMBB->getFirstTerminator(); + } + + Register NewReg = MRI->createVirtualRegister(ARC); + MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(), + TII->get(CopyOpc), NewReg) + .addReg(Reg); + MO.setReg(NewReg); + + (void)MI; + LLVM_DEBUG(dbgs() << " Created COPY: " << *MI); + } + + // Replace the PHI's result with a new register. Register NewReg = MRI->createVirtualRegister(ARC); - PHI.getOperand(1).setReg(CopyIn); PHI.getOperand(0).setReg(NewReg); + // COPY that new register back to the original PhiOut register. This COPY will + // usually be folded out later. MachineBasicBlock *MBB = PHI.getParent(); - BuildMI(*MBB, MBB->getFirstNonPHI(), Copy->getDebugLoc(), + BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(), TII->get(AMDGPU::COPY), PhiOut) - .addReg(NewReg, RegState::Kill); - Copy->eraseFromParent(); // We know this copy had a single use. - - LLVM_DEBUG(dbgs() << "Folded " << PHI); + .addReg(NewReg); + LLVM_DEBUG(dbgs() << " Done: Folded " << PHI); return true; } @@ -1733,6 +1870,101 @@ bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) { return true; } +// tryFoldPhiAGPR will aggressively try to create AGPR PHIs. +// For GFX90A and later, this is pretty much always a good thing, but for GFX908 +// there's cases where it can create a lot more AGPR-AGPR copies, which are +// expensive on this architecture due to the lack of V_ACCVGPR_MOV. +// +// This function looks at all AGPR PHIs in a basic block and collects their +// operands. Then, it checks for register that are used more than once across +// all PHIs and caches them in a VGPR. 
This prevents ExpandPostRAPseudo from +// having to create one VGPR temporary per use, which can get very messy if +// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector +// element). +// +// Example +// a: +// %in:agpr_256 = COPY %foo:vgpr_256 +// c: +// %x:agpr_32 = .. +// b: +// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c +// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c +// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c +// => +// a: +// %in:agpr_256 = COPY %foo:vgpr_256 +// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32 +// %tmp_agpr:agpr_32 = COPY %tmp +// c: +// %x:agpr_32 = .. +// b: +// %0:areg = PHI %tmp_agpr, %a, %x, %c +// %1:areg = PHI %tmp_agpr, %a, %y, %c +// %2:areg = PHI %tmp_agpr, %a, %z, %c +bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) { + // This is only really needed on GFX908 where AGPR-AGPR copies are + // unreasonably difficult. + if (ST->hasGFX90AInsts()) + return false; + + // Look at all AGPR Phis and collect the register + subregister used. + DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>> + RegToMO; + + for (auto &MI : MBB) { + if (!MI.isPHI()) + break; + + if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg())) + continue; + + for (unsigned K = 1; K < MI.getNumOperands(); K += 2) { + MachineOperand &PhiMO = MI.getOperand(K); + RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO); + } + } + + // For all (Reg, SubReg) pair that are used more than once, cache the value in + // a VGPR. + bool Changed = false; + for (const auto &[Entry, MOs] : RegToMO) { + if (MOs.size() == 1) + continue; + + const auto [Reg, SubReg] = Entry; + MachineInstr *Def = MRI->getVRegDef(Reg); + MachineBasicBlock *DefMBB = Def->getParent(); + + // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded + // out. 
+ const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front()); + Register TempVGPR = + MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC)); + MachineInstr *VGPRCopy = + BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(), + TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR) + .addReg(Reg, /* flags */ 0, SubReg); + + // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs. + Register TempAGPR = MRI->createVirtualRegister(ARC); + BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(), + TII->get(AMDGPU::COPY), TempAGPR) + .addReg(TempVGPR); + + LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy); + for (MachineOperand *MO : MOs) { + MO->setReg(TempAGPR); + MO->setSubReg(AMDGPU::NoSubRegister); + LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n"); + } + + Changed = true; + } + + return Changed; +} + bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -1766,7 +1998,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { continue; } - if (MI.isPHI() && tryFoldLCSSAPhi(MI)) { + if (MI.isPHI() && tryFoldPhiAGPR(MI)) { Changed = true; continue; } @@ -1791,6 +2023,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { !tryFoldOMod(MI)) Changed |= tryFoldClamp(MI); } + + Changed |= tryOptimizeAGPRPhis(*MBB); } return Changed; diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index a1eb8150595f..edcfd994033e 100644 --- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -119,9 +119,7 @@ static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) { // If this is a load instruction where the result has been coalesced with an operand, then we cannot clause it. 
for (const MachineOperand &ResMO : MI.defs()) { Register ResReg = ResMO.getReg(); - for (const MachineOperand &MO : MI.uses()) { - if (!MO.isReg() || MO.isDef()) - continue; + for (const MachineOperand &MO : MI.all_uses()) { if (MO.getReg() == ResReg) return false; } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index c2bc95930272..865caae240f3 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -64,9 +64,12 @@ static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, return MCRegister(); } +/// Query target location for spilling SGPRs +/// \p IncludeScratchCopy : Also look for free scratch SGPRs static void getVGPRSpillLaneOrTempRegister( MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR, - const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass) { + const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass, + bool IncludeScratchCopy = true) { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); @@ -77,9 +80,12 @@ static void getVGPRSpillLaneOrTempRegister( // We need to save and restore the given SGPR. + Register ScratchSGPR; // 1: Try to save the given register into an unused scratch SGPR. The LiveRegs - // should have all the callee saved registers marked as used. - Register ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC); + // should have all the callee saved registers marked as used. For certain + // cases we skip copy to scratch SGPR. 
+ if (IncludeScratchCopy) + ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC); if (!ScratchSGPR) { int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr, @@ -93,10 +99,10 @@ static void getVGPRSpillLaneOrTempRegister( SGPR, PrologEpilogSGPRSaveRestoreInfo( SGPRSaveKind::SPILL_TO_VGPR_LANE, FI)); - LLVM_DEBUG( - auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front(); - dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " - << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';); + LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front(); + dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " + << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane + << '\n';); } else { // Remove dead <FI> index MF.getFrameInfo().RemoveStackObject(FI); @@ -258,7 +264,7 @@ class PrologEpilogSGPRSpillBuilder { assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef<SIRegisterInfo::SpilledReg> Spill = - FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); + FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI); assert(Spill.size() == NumSubRegs); for (unsigned I = 0; I < NumSubRegs; ++I) { @@ -303,7 +309,7 @@ class PrologEpilogSGPRSpillBuilder { void restoreFromVGPRLane(const int FI) { assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef<SIRegisterInfo::SpilledReg> Spill = - FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); + FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI); assert(Spill.size() == NumSubRegs); for (unsigned I = 0; I < NumSubRegs; ++I) { @@ -565,7 +571,7 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( // reserved input we needed. Also for PAL, make sure we don't clobber // the GIT pointer passed in SGPR0 or SGPR8. 
if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && - !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) { + (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) { MRI.replaceRegWith(ScratchRsrcReg, Reg); MFI->setScratchRSrcReg(Reg); return Reg; @@ -935,8 +941,7 @@ void SIFrameLowering::emitCSRSpillStores( if (!WWMCalleeSavedRegs.empty()) { if (ScratchExecCopy) { unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); } else { ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, /*IsProlog*/ true, @@ -948,8 +953,7 @@ void SIFrameLowering::emitCSRSpillStores( if (ScratchExecCopy) { // FIXME: Split block and make terminator. unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) + BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) .addReg(ScratchExecCopy, RegState::Kill); LiveRegs.addReg(ScratchExecCopy); } @@ -1040,8 +1044,7 @@ void SIFrameLowering::emitCSRSpillRestores( if (!WWMCalleeSavedRegs.empty()) { if (ScratchExecCopy) { unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1); } else { ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL, /*IsProlog*/ false, @@ -1053,8 +1056,7 @@ void SIFrameLowering::emitCSRSpillRestores( if (ScratchExecCopy) { // FIXME: Split block and make terminator. unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - MCRegister Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) + BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec()) .addReg(ScratchExecCopy, RegState::Kill); } } @@ -1350,8 +1352,9 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, TRI->isAGPR(MRI, VReg))) { - // FIXME: change to enterBasicBlockEnd() - RS->enterBasicBlock(MBB); + assert(RS != nullptr); + RS->enterBasicBlockEnd(MBB); + RS->backward(MI); TRI->eliminateFrameIndex(MI, 0, FIOp, RS); SpillFIs.set(FI); continue; @@ -1436,20 +1439,36 @@ void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced( TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) < TRI->getHWRegIndex(VGPRForAGPRCopy))) { - // Call to setVGPRForAGPRCopy() should happen first before calling - // freezeReservedRegs() so that getReservedRegs() can reserve this newly - // identified VGPR (for AGPR copy). + // Reserve this newly identified VGPR (for AGPR copy) + // reserved registers should already be frozen at this point + // so we can avoid calling MRI.freezeReservedRegs and just use + // MRI.reserveReg FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR); - MRI.freezeReservedRegs(MF); + MRI.reserveReg(UnusedLowVGPR, TRI); } } + // We initally reserved the highest available SGPR pair for long branches + // now, after RA, we shift down to a lower unused one if one exists + Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg(); + Register UnusedLowSGPR = + TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF); + // If LongBranchReservedReg is null then we didn't find a long branch + // and never reserved a register to begin with so there is nothing to + // shift down. Then if UnusedLowSGPR is null, there isn't available lower + // register to use so just keep the original one we set. 
+ if (LongBranchReservedReg && UnusedLowSGPR) { + FuncInfo->setLongBranchReservedReg(UnusedLowSGPR); + MRI.reserveReg(UnusedLowSGPR, TRI); + } } // The special SGPR spills like the one needed for FP, BP or any reserved // registers delayed until frame lowering. void SIFrameLowering::determinePrologEpilogSGPRSaves( - MachineFunction &MF, BitVector &SavedVGPRs) const { + MachineFunction &MF, BitVector &SavedVGPRs, + bool NeedExecCopyReservedReg) const { MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -1461,6 +1480,26 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves( for (unsigned I = 0; CSRegs[I]; ++I) LiveRegs.addReg(CSRegs[I]); + const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass(); + + if (NeedExecCopyReservedReg) { + Register ReservedReg = MFI->getSGPRForEXECCopy(); + assert(ReservedReg && "Should have reserved an SGPR for EXEC copy."); + Register UnusedScratchReg = findUnusedRegister(MRI, LiveRegs, RC); + if (UnusedScratchReg) { + // If found any unused scratch SGPR, reserve the register itself for Exec + // copy and there is no need for any spills in that case. + MFI->setSGPRForEXECCopy(UnusedScratchReg); + LiveRegs.addReg(UnusedScratchReg); + } else { + // Needs spill. + assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) && + "Re-reserving spill slot for EXEC copy register"); + getVGPRSpillLaneOrTempRegister(MF, LiveRegs, ReservedReg, RC, + /*IncludeScratchCopy=*/false); + } + } + // hasFP only knows about stack objects that already exist. We're now // determining the stack slots that will be created, so we have to predict // them. Stack objects force FP usage with calls. 
@@ -1499,7 +1538,10 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + bool NeedExecCopyReservedReg = false; + MachineInstr *ReturnMI = nullptr; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { // WRITELANE instructions used for SGPR spills can overwrite the inactive @@ -1516,6 +1558,25 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg()); else if (MI.getOpcode() == AMDGPU::V_READLANE_B32) MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg()); + else if (TII->isWWMRegSpillOpcode(MI.getOpcode())) + NeedExecCopyReservedReg = true; + else if (MI.getOpcode() == AMDGPU::SI_RETURN || + MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { + // We expect all return to be the same size. + assert(!ReturnMI || + (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) == + count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); }))); + ReturnMI = &MI; + } + } + } + + // Remove any VGPRs used in the return value because these do not need to be saved. + // This prevents CSR restore from clobbering return VGPRs. + if (ReturnMI) { + for (auto &Op : ReturnMI->operands()) { + if (Op.isReg()) + SavedVGPRs.reset(Op.getReg()); } } @@ -1528,7 +1589,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, if (!ST.hasGFX90AInsts()) SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask()); - determinePrologEpilogSGPRSaves(MF, SavedVGPRs); + determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg); // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't // allow the default insertion to handle them. 
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index def07dc4b1f7..0060fc0be431 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -34,8 +34,8 @@ public: RegScavenger *RS = nullptr) const override; void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const; - void determinePrologEpilogSGPRSaves(MachineFunction &MF, - BitVector &SavedRegs) const; + void determinePrologEpilogSGPRSaves(MachineFunction &MF, BitVector &SavedRegs, + bool NeedExecCopyReservedReg) const; void emitCSRSpillStores(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs, Register FrameReg, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e0ad11d5af24..3148f49ff0d5 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15,14 +15,17 @@ #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/ByteProvider.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" @@ -35,8 +38,9 @@ #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/ModRef.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Support/ModRef.h" +#include <optional> 
using namespace llvm; @@ -55,14 +59,14 @@ static cl::opt<bool> UseDivergentRegisterIndexing( cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false)); -static bool hasFP32Denormals(const MachineFunction &MF) { +static bool denormalModeIsFlushAllF32(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - return Info->getMode().allFP32Denormals(); + return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign(); } -static bool hasFP64FP16Denormals(const MachineFunction &MF) { +static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - return Info->getMode().allFP64FP16Denormals(); + return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign(); } static unsigned findFirstFreeSGPR(CCState &CCInfo) { @@ -215,6 +219,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); + setOperationAction(ISD::FSQRT, MVT::f64, Custom); + setOperationAction(ISD::SELECT_CC, {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand); @@ -244,13 +250,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal); - setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i32, Legal); + setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal); setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64, Expand); #if 0 - setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i64, Legal); + setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal); #endif // We only support LOAD/STORE and vector manipulation ops for vectors @@ -470,6 +476,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, MVT::f64, Custom); setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + 
setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64}, + Legal); + setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom); setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); @@ -514,9 +523,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16); // F16 - VOP1 Actions. - setOperationAction( - {ISD::FP_ROUND, ISD::FCOS, ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND}, - MVT::f16, Custom); + setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS, + ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND}, + MVT::f16, Custom); setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); @@ -526,7 +535,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // F16 - VOP2 Actions. setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand); - + setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom); + setOperationAction(ISD::FFREXP, MVT::f16, Custom); setOperationAction(ISD::FDIV, MVT::f16, Custom); // F16 - VOP3 Actions. 
@@ -728,25 +738,25 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16, - MVT::v2i16, MVT::v2f16}, + MVT::v2i16, MVT::v2f16, MVT::i128}, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16, - MVT::i16, MVT::i8}, + MVT::i16, MVT::i8, MVT::i128}, Custom); setOperationAction(ISD::INTRINSIC_VOID, {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16, - MVT::i8}, + MVT::i8, MVT::i128}, Custom); setTargetDAGCombine({ISD::ADD, - ISD::ADDCARRY, + ISD::UADDO_CARRY, ISD::SUB, - ISD::SUBCARRY, + ISD::USUBO_CARRY, ISD::FADD, ISD::FSUB, ISD::FMINNUM, @@ -769,7 +779,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND_INREG, ISD::EXTRACT_VECTOR_ELT, - ISD::INSERT_VECTOR_ELT}); + ISD::INSERT_VECTOR_ELT, + ISD::FCOPYSIGN}); + + if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16()) + setTargetDAGCombine(ISD::FP_ROUND); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. @@ -791,6 +805,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::ATOMIC_LOAD_UMIN, ISD::ATOMIC_LOAD_UMAX, ISD::ATOMIC_LOAD_FADD, + ISD::ATOMIC_LOAD_UINC_WRAP, + ISD::ATOMIC_LOAD_UDEC_WRAP, ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN}); @@ -816,10 +832,10 @@ bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const { return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && - DestVT.getScalarType() == MVT::f32 && - SrcVT.getScalarType() == MVT::f16 && - // TODO: This probably only requires no input flushing? 
- !hasFP32Denormals(DAG.getMachineFunction()); + DestVT.getScalarType() == MVT::f32 && + SrcVT.getScalarType() == MVT::f16 && + // TODO: This probably only requires no input flushing? + denormalModeIsFlushAllF32(DAG.getMachineFunction()); } bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, @@ -829,7 +845,7 @@ bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, DestTy.getScalarSizeInBits() == 32 && SrcTy.getScalarSizeInBits() == 16 && // TODO: This probably only requires no input flushing? - !hasFP32Denormals(*MI.getMF()); + denormalModeIsFlushAllF32(*MI.getMF()); } bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { @@ -976,6 +992,26 @@ static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) { return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes); } +/// Map address space 7 to MVT::v5i32 because that's its in-memory +/// representation. This return value is vector-typed because there is no +/// MVT::i160 and it is not clear if one can be added. While this could +/// cause issues during codegen, these address space 7 pointers will be +/// rewritten away by then. Therefore, we can return MVT::v5i32 in order +/// to allow pre-codegen passes that query TargetTransformInfo, often for cost +/// modeling, to work. +MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const { + if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160) + return MVT::v5i32; + return AMDGPUTargetLowering::getPointerTy(DL, AS); +} +/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka +/// v8i32 when padding is added. 
+MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const { + if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160) + return MVT::v8i32; + return AMDGPUTargetLowering::getPointerMemTy(DL, AS); +} + bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, @@ -993,11 +1029,22 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return false; // TODO: Should images get their own address space? - Info.fallbackAddressSpace = AMDGPUAS::BUFFER_FAT_POINTER; + Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE; if (RsrcIntr->IsImage) Info.align.reset(); + Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg); + if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) { + if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE) + // We conservatively set the memory operand of a buffer intrinsic to the + // base resource pointer, so that we can access alias information about + // those pointers. Cases like "this points at the same value + // but with a different offset" are handled in + // areMemAccessesTriviallyDisjoint. 
+ Info.ptrVal = RsrcArg; + } + Info.flags |= MachineMemOperand::MODereferenceable; if (ME.onlyReadsMemory()) { unsigned MaxNumLanes = 4; @@ -1050,7 +1097,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, default: break; case Intrinsic::amdgcn_raw_buffer_load_lds: - case Intrinsic::amdgcn_struct_buffer_load_lds: { + case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: + case Intrinsic::amdgcn_struct_buffer_load_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8); return true; @@ -1061,8 +1110,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, } switch (IntrID) { - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_ds_fadd: @@ -1083,7 +1130,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_buffer_atomic_fadd: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); - Info.fallbackAddressSpace = AMDGPUAS::BUFFER_FAT_POINTER; + Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE; Info.align.reset(); Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; @@ -1093,6 +1140,15 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } + case Intrinsic::amdgcn_ds_add_gs_reg_rtn: + case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); + Info.ptrVal = nullptr; + Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER; + Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + return true; + } case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: { Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1121,7 +1177,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 
Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT? - Info.fallbackAddressSpace = AMDGPUAS::BUFFER_FAT_POINTER; + Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE; Info.align.reset(); Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable; @@ -1204,8 +1260,6 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, SmallVectorImpl<Value*> &Ops, Type *&AccessTy) const { switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_ds_append: @@ -1313,7 +1367,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, if (AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || - AS == AMDGPUAS::BUFFER_FAT_POINTER) { + AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE) { // If the offset isn't a multiple of 4, it probably isn't going to be // correctly aligned. // FIXME: Can we get the real alignment here? @@ -1336,12 +1390,16 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // in 8-bits, it can use a smaller encoding. if (!isUInt<32>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) { // On VI, these use the SMEM format and the offset is 20-bit in bytes. if (!isUInt<20>(AM.BaseOffs)) return false; - } else - llvm_unreachable("unhandled generation"); + } else { + // On GFX9 the offset is signed 21-bit in bytes (but must not be negative + // for S_BUFFER_* instructions). + if (!isInt<21>(AM.BaseOffs)) + return false; + } if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. 
return true; @@ -1350,11 +1408,12 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return true; return false; + } - } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { + if (AS == AMDGPUAS::PRIVATE_ADDRESS) return isLegalMUBUFAddressingMode(AM); - } else if (AS == AMDGPUAS::LOCAL_ADDRESS || - AS == AMDGPUAS::REGION_ADDRESS) { + + if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { // Basic, single offset DS instructions allow a 16-bit unsigned immediate // field. // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have @@ -1369,8 +1428,9 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return true; return false; - } else if (AS == AMDGPUAS::FLAT_ADDRESS || - AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) { + } + + if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) { // For an unknown address space, this usually means that this is for some // reason being used for pure arithmetic, and not based on some addressing // computation. We don't have instructions that compute pointers with any @@ -1544,18 +1604,14 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( return AlignedBy4; } - if (Subtarget->hasUnalignedBufferAccessEnabled()) { - // If we have a uniform constant load, it still requires using a slow - // buffer instruction if unaligned. - if (IsFast) { - // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so - // 2-byte alignment is worse than 1 unless doing a 2-byte access. - *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || - AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ? 
- Alignment >= Align(4) : Alignment != Align(2); - } + // So long as they are correct, wide global memory operations perform better + // than multiple smaller memory ops -- even when misaligned + if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) { + if (IsFast) + *IsFast = Size; - return true; + return Alignment >= Align(4) || + Subtarget->hasUnalignedBufferAccessEnabled(); } // Smaller than dword value must be aligned. @@ -1864,7 +1920,7 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, return DAG.getUNDEF(VT); } - return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT); + return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg); } static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, @@ -2082,7 +2138,9 @@ void SITargetLowering::allocateSpecialInputSGPRs( if (Info.hasDispatchPtr()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); - if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) + const Module *M = MF.getFunction().getParent(); + if (Info.hasQueuePtr() && + AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); // Implicit arg ptr takes the place of the kernarg segment pointer. 
This is a @@ -2132,7 +2190,9 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, CCInfo.AllocateReg(DispatchPtrReg); } - if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) { + const Module *M = MF.getFunction().getParent(); + if (Info.hasQueuePtr() && + AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); @@ -2175,11 +2235,16 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const { + bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs(); if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) { // Note: user SGPRs are handled by the front-end for graphics shaders // Pad up the used user SGPRs with dead inputs. - unsigned CurrentUserSGPRs = Info.getNumUserSGPRs(); + // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately + // before enabling architected SGPRs for workgroup IDs. + assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget"); + + unsigned CurrentUserSGPRs = Info.getNumUserSGPRs(); // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to // rely on it to reach 16 since if we end up having no stack usage, it will // not really be added. 
@@ -2195,20 +2260,26 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, } if (Info.hasWorkGroupIDX()) { - Register Reg = Info.addWorkGroupIDX(); - MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs); + if (!HasArchitectedSGPRs) + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupIDY()) { - Register Reg = Info.addWorkGroupIDY(); - MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs); + if (!HasArchitectedSGPRs) + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupIDZ()) { - Register Reg = Info.addWorkGroupIDZ(); - MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs); + if (!HasArchitectedSGPRs) + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); } @@ -2395,8 +2466,6 @@ SDValue SITargetLowering::LowerFormalArguments( return DAG.getEntryNode(); } - Info->allocateKnownAddressLDSGlobal(Fn); - SmallVector<ISD::InputArg, 16> Splits; SmallVector<CCValAssign, 16> ArgLocs; BitVector Skipped(Ins.size()); @@ -2409,11 +2478,14 @@ SDValue SITargetLowering::LowerFormalArguments( if (IsGraphics) { assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && - (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) && - !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && - !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && - !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() && - !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); + !Info->hasWorkGroupInfo() && !Info->hasLDSKernelId() && + !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && + !Info->hasWorkItemIDZ()); + if (!Subtarget->enableFlatScratch()) + assert(!Info->hasFlatScratchInit()); + if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs()) + assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() 
&& + !Info->hasWorkGroupIDZ()); } if (CallConv == CallingConv::AMDGPU_PS) { @@ -2451,7 +2523,7 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); if ((PsInputBits & 0x7F) == 0 || ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1))) - Info->markPSInputEnabled(countTrailingZeros(Info->getPSInputAddr())); + Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr())); } } else if (IsKernel) { assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); @@ -2610,7 +2682,7 @@ SDValue SITargetLowering::LowerFormalArguments( DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo()); - unsigned StackArgSize = CCInfo.getNextStackOffset(); + unsigned StackArgSize = CCInfo.getStackSize(); Info->setBytesInStackArgArea(StackArgSize); return Chains.empty() ? Chain : @@ -2632,7 +2704,17 @@ bool SITargetLowering::CanLowerReturn( SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); - return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)); + if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg))) + return false; + + // We must use the stack if return would require unavailable registers. + unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF); + unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); + for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) + if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i))) + return false; + + return true; } SDValue @@ -2665,7 +2747,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Analyze outgoing return values. 
CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); - SDValue Flag; + SDValue Glue; SmallVector<SDValue, 48> RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) @@ -2697,8 +2779,8 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, llvm_unreachable("Unknown loc info!"); } - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); - Flag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue); + Glue = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } @@ -2721,17 +2803,17 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Update chain and glue. RetOps[0] = Chain; - if (Flag.getNode()) - RetOps.push_back(Flag); + if (Glue.getNode()) + RetOps.push_back(Glue); unsigned Opc = AMDGPUISD::ENDPGM; if (!IsWaveEnd) - Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG; + Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE; return DAG.getNode(Opc, DL, MVT::Other, RetOps); } SDValue SITargetLowering::LowerCallResult( - SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg, + SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn, SDValue ThisVal) const { @@ -2749,9 +2831,9 @@ SDValue SITargetLowering::LowerCallResult( SDValue Val; if (VA.isRegLoc()) { - Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); + Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue); Chain = Val.getValue(1); - InFlag = Val.getValue(2); + InGlue = Val.getValue(2); } else if (VA.isMemLoc()) { report_fatal_error("TODO: return values in memory"); } else @@ -3066,7 +3148,7 @@ bool SITargetLowering::isEligibleForTailCallOptimization( // If the stack arguments for this call do not fit into our own save area then // 
the call cannot be made tail. // TODO: Is this really necessary? - if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) + if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) return false; const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -3122,21 +3204,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, "unsupported required tail call to function "); } - if (AMDGPU::isShader(CallConv)) { - // Note the issue is with the CC of the called function, not of the call - // itself. - return lowerUnhandledCall(CLI, InVals, - "unsupported call to a shader function "); - } - - if (AMDGPU::isShader(MF.getFunction().getCallingConv()) && - CallConv != CallingConv::AMDGPU_Gfx) { - // Only allow calls with specific calling conventions. - return lowerUnhandledCall(CLI, InVals, - "unsupported calling convention for call from " - "graphics shader of function "); - } - if (IsTailCall) { IsTailCall = isEligibleForTailCallOptimization( Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); @@ -3173,7 +3240,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, CCInfo.AnalyzeCallOperands(Outs, AssignFn); // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getNextStackOffset(); + unsigned NumBytes = CCInfo.getStackSize(); if (IsSibCall) { // Since we're not changing the ABI to make this a tail call, the memory @@ -3309,11 +3376,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. 
- SDValue InFlag; + SDValue InGlue; for (auto &RegToPass : RegsToPass) { Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, - RegToPass.second, InFlag); - InFlag = Chain.getValue(1); + RegToPass.second, InGlue); + InGlue = Chain.getValue(1); } @@ -3322,8 +3389,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // we've carefully laid out the parameters so that when sp is reset they'll be // in the correct location. if (IsTailCall && !IsSibCall) { - Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InFlag, DL); - InFlag = Chain.getValue(1); + Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL); + InGlue = Chain.getValue(1); } std::vector<SDValue> Ops; @@ -3359,8 +3426,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); - if (InFlag.getNode()) - Ops.push_back(InFlag); + if (InGlue.getNode()) + Ops.push_back(InGlue); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); @@ -3368,22 +3435,24 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // actual call instruction. if (IsTailCall) { MFI.setHasTailCall(); - return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops); + unsigned OPC = CallConv == CallingConv::AMDGPU_Gfx ? + AMDGPUISD::TC_RETURN_GFX : AMDGPUISD::TC_RETURN; + return DAG.getNode(OPC, DL, NodeTys, Ops); } // Returns a chain and a flag for retval copy to use. SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops); Chain = Call.getValue(0); - InFlag = Call.getValue(1); + InGlue = Call.getValue(1); uint64_t CalleePopBytes = NumBytes; - Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InFlag, DL); + Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL); if (!Ins.empty()) - InFlag = Chain.getValue(1); + InGlue = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we // return. 
- return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, + return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG, InVals, IsThisReturn, IsThisReturn ? OutVals[0] : SDValue()); } @@ -4000,6 +4069,120 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, return LoopBB; } +static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, + MachineBasicBlock &BB, + const GCNSubtarget &ST, + unsigned Opc) { + MachineRegisterInfo &MRI = BB.getParent()->getRegInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + // Reduction operations depend on whether the input operand is SGPR or VGPR. + Register SrcReg = MI.getOperand(1).getReg(); + bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg)); + Register DstReg = MI.getOperand(0).getReg(); + MachineBasicBlock *RetBB = nullptr; + if (isSGPR) { + // These operations with a uniform value i.e. SGPR are idempotent. + // Reduced value will be same as given sgpr. + BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg); + RetBB = &BB; + } else { + // TODO: Implement DPP Strategy and switch based on immediate strategy + // operand. For now, for all the cases (default, Iterative and DPP we use + // iterative approach by default.) + + // To reduce the VGPR using iterative approach, we need to iterate + // over all the active lanes. Lowering consists of ComputeLoop, + // which iterate over only active lanes. We use copy of EXEC register + // as induction variable and every active lane modifies it using bitset0 + // so that we will get the next active lane for next iteration. + MachineBasicBlock::iterator I = BB.end(); + Register SrcReg = MI.getOperand(1).getReg(); + + // Create Control flow for loop + // Split MI's Machine Basic block into For loop + auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true); + + // Create virtual registers required for lowering. 
+ const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); + const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg); + Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass); + Register InitalValReg = MRI.createVirtualRegister(DstRegClass); + + Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass); + Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); + Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass); + + Register FF1Reg = MRI.createVirtualRegister(DstRegClass); + Register LaneValueReg = MRI.createVirtualRegister(DstRegClass); + + bool IsWave32 = ST.isWave32(); + unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + + // Create initail values of induction variable from Exec, Accumulator and + // insert branch instr to newly created ComputeBlockk + uint32_t InitalValue = + (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0; + auto TmpSReg = + BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg); + BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg) + .addImm(InitalValue); + BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop); + + // Start constructing ComputeLoop + I = ComputeLoop->end(); + auto Accumulator = + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg) + .addReg(InitalValReg) + .addMBB(&BB); + auto ActiveBits = + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg) + .addReg(TmpSReg->getOperand(0).getReg()) + .addMBB(&BB); + + // Perform the computations + unsigned SFFOpc = IsWave32 ? 
AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64; + auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg) + .addReg(ActiveBits->getOperand(0).getReg()); + auto LaneValue = BuildMI(*ComputeLoop, I, DL, + TII->get(AMDGPU::V_READLANE_B32), LaneValueReg) + .addReg(SrcReg) + .addReg(FF1->getOperand(0).getReg()); + auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) + .addReg(Accumulator->getOperand(0).getReg()) + .addReg(LaneValue->getOperand(0).getReg()); + + // Manipulate the iterator to get the next active lane + unsigned BITSETOpc = + IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64; + auto NewActiveBits = + BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg) + .addReg(FF1->getOperand(0).getReg()) + .addReg(ActiveBits->getOperand(0).getReg()); + + // Add phi nodes + Accumulator.addReg(NewAccumulator->getOperand(0).getReg()) + .addMBB(ComputeLoop); + ActiveBits.addReg(NewActiveBits->getOperand(0).getReg()) + .addMBB(ComputeLoop); + + // Creating branching + unsigned CMPOpc = IsWave32 ? 
AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64; + BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc)) + .addReg(NewActiveBits->getOperand(0).getReg()) + .addImm(0); + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) + .addMBB(ComputeLoop); + + RetBB = ComputeEnd; + } + MI.eraseFromParent(); + return RetBB; +} + MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { @@ -4008,6 +4191,10 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); switch (MI.getOpcode()) { + case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32); + case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32: + return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32); case AMDGPU::S_UADDO_PSEUDO: case AMDGPU::S_USUBO_PSEUDO: { const DebugLoc &DL = MI.getDebugLoc(); @@ -4460,15 +4647,54 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( return BB; } + case AMDGPU::S_INVERSE_BALLOT_U32: + case AMDGPU::S_INVERSE_BALLOT_U64: { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + const Register DstReg = MI.getOperand(0).getReg(); + Register MaskReg = MI.getOperand(1).getReg(); + + const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg); + + if (IsVALU) { + MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI); + } + + BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::ENDPGM_TRAP: { + const DebugLoc &DL = MI.getDebugLoc(); + if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) { + MI.setDesc(TII->get(AMDGPU::S_ENDPGM)); + MI.addOperand(MachineOperand::CreateImm(0)); + return BB; + } + + // We need a block split to make the real endpgm a 
terminator. We also don't + // want to break phis in successor blocks, so we can't just delete to the + // end of the block. + + MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/); + MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); + MF->push_back(TrapBB); + BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM)) + .addImm(0); + BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addMBB(TrapBB); + + BB->addSuccessor(TrapBB); + MI.eraseFromParent(); + return SplitBB; + } default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } } -bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const { - return isTypeLegal(VT.getScalarType()); -} - bool SITargetLowering::hasAtomicFaddRtnForTy(SDValue &Op) const { switch (Op.getValue(0).getSimpleValueType().SimpleTy) { case MVT::f32: @@ -4542,7 +4768,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, // Otherwise f32 mad is always full rate and returns the same result as // the separate operations so should be preferred over fma. // However does not support denormals. - if (hasFP32Denormals(MF)) + if (!denormalModeIsFlushAllF32(MF)) return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32. 
@@ -4551,7 +4777,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, case MVT::f64: return true; case MVT::f16: - return Subtarget->has16BitInsts() && hasFP64FP16Denormals(MF); + return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF); default: break; } @@ -4580,9 +4806,10 @@ bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const { return false; if (Ty.getScalarSizeInBits() == 16) - return Subtarget->hasMadF16() && !hasFP64FP16Denormals(*MI.getMF()); + return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF()); if (Ty.getScalarSizeInBits() == 32) - return Subtarget->hasMadMacF32Insts() && !hasFP32Denormals(*MI.getMF()); + return Subtarget->hasMadMacF32Insts() && + denormalModeIsFlushAllF32(*MI.getMF()); return false; } @@ -4594,10 +4821,10 @@ bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG, EVT VT = N->getValueType(0); if (VT == MVT::f32) return Subtarget->hasMadMacF32Insts() && - !hasFP32Denormals(DAG.getMachineFunction()); + denormalModeIsFlushAllF32(DAG.getMachineFunction()); if (VT == MVT::f16) { return Subtarget->hasMadF16() && - !hasFP64FP16Denormals(DAG.getMachineFunction()); + denormalModeIsFlushAllF64F16(DAG.getMachineFunction()); } return false; @@ -4613,7 +4840,10 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4f16 || VT == MVT::v4i16); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || + VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || + VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4696,12 +4926,16 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { "Load should return a value and a chain"); return Result; } - + case ISD::FSQRT: + if (Op.getValueType() == MVT::f64) + 
return lowerFSQRTF64(Op, DAG); + return SDValue(); case ISD::FSIN: case ISD::FCOS: return LowerTrig(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::FDIV: return LowerFDIV(Op, DAG); + case ISD::FFREXP: return LowerFFREXP(Op, DAG); case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::GlobalAddress: { @@ -4726,6 +4960,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BUILD_VECTOR: return lowerBUILD_VECTOR(Op, DAG); case ISD::FP_ROUND: + case ISD::STRICT_FP_ROUND: return lowerFP_ROUND(Op, DAG); case ISD::FPTRUNC_ROUND: { unsigned Opc; @@ -4757,6 +4992,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMINNUM: case ISD::FMAXNUM: return lowerFMINNUM_FMAXNUM(Op, DAG); + case ISD::FLDEXP: + case ISD::STRICT_FLDEXP: + return lowerFLDEXP(Op, DAG); case ISD::FMA: return splitTernaryVectorOp(Op, DAG); case ISD::FP_TO_SINT: @@ -5038,6 +5276,9 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); switch (IID) { + case Intrinsic::amdgcn_make_buffer_rsrc: + Results.push_back(lowerPointerAsRsrcIntrin(N, DAG)); + return; case Intrinsic::amdgcn_cvt_pkrtz: { SDValue Src0 = N->getOperand(1); SDValue Src1 = N->getOperand(2); @@ -5142,6 +5383,7 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, return; } default: + AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); break; } } @@ -5349,6 +5591,10 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { if (SrcVT != MVT::f64) return Op; + // TODO: Handle strictfp + if (Op.getOpcode() != ISD::FP_ROUND) + return Op; + SDLoc DL(Op); SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src); @@ -5375,6 +5621,40 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, return Op; } +SDValue 
SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const { + bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP; + EVT VT = Op.getValueType(); + assert(VT == MVT::f16); + + SDValue Exp = Op.getOperand(IsStrict ? 2 : 1); + EVT ExpVT = Exp.getValueType(); + if (ExpVT == MVT::i16) + return Op; + + SDLoc DL(Op); + + // Correct the exponent type for f16 to i16. + // Clamp the range of the exponent to the instruction's range. + + // TODO: This should be a generic narrowing legalization, and can easily be + // for GlobalISel. + + SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT); + SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp); + + SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT); + SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp); + + SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp); + + if (IsStrict) { + return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1), TruncExp}); + } + + return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp); +} + SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc SL(Op); @@ -5431,26 +5711,20 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) return lowerTrapEndpgm(Op, DAG); - if (std::optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(Subtarget)) { - switch (*HsaAbiVer) { - case ELF::ELFABIVERSION_AMDGPU_HSA_V2: - case ELF::ELFABIVERSION_AMDGPU_HSA_V3: - return lowerTrapHsaQueuePtr(Op, DAG); - case ELF::ELFABIVERSION_AMDGPU_HSA_V4: - case ELF::ELFABIVERSION_AMDGPU_HSA_V5: - return Subtarget->supportsGetDoorbellID() ? 
- lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG); - } - } + const Module *M = DAG.getMachineFunction().getFunction().getParent(); + unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M); + if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3) + return lowerTrapHsaQueuePtr(Op, DAG); - llvm_unreachable("Unknown trap handler"); + return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) : + lowerTrapHsaQueuePtr(Op, DAG); } SDValue SITargetLowering::lowerTrapEndpgm( SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Chain = Op.getOperand(0); - return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); + return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain); } SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT, @@ -5471,7 +5745,8 @@ SDValue SITargetLowering::lowerTrapHsaQueuePtr( SDValue QueuePtr; // For code object version 5, QueuePtr is passed through implicit kernarg. - if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + const Module *M = DAG.getMachineFunction().getFunction().getParent(); + if (AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { QueuePtr = loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR); } else { @@ -5574,7 +5849,8 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, // For code object version 5, private_base and shared_base are passed through // implicit kernargs. - if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { + const Module *M = DAG.getMachineFunction().getFunction().getParent(); + if (AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { ImplicitParameter Param = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 
SHARED_BASE : PRIVATE_BASE; return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param); @@ -5721,6 +5997,35 @@ SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); SDLoc SL(Op); + if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) { + // Insert 32-bit registers at a time. + assert(InsNumElts % 2 == 0 && "expect legal vector types"); + + unsigned VecNumElts = VecVT.getVectorNumElements(); + EVT NewVecVT = + EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2); + EVT NewInsVT = InsNumElts == 2 ? MVT::i32 + : EVT::getVectorVT(*DAG.getContext(), + MVT::i32, InsNumElts / 2); + + Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec); + Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins); + + for (unsigned I = 0; I != InsNumElts / 2; ++I) { + SDValue Elt; + if (InsNumElts == 2) { + Elt = Ins; + } else { + Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins, + DAG.getConstant(I, SL, MVT::i32)); + } + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt, + DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32)); + } + + return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec); + } + for (unsigned I = 0; I != InsNumElts; ++I) { SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins, DAG.getConstant(I, SL, MVT::i32)); @@ -6130,7 +6435,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) { assert(PtrVT == MVT::i32 && "32-bit pointer is expected."); // Adjust alignment for that dynamic shared memory array. 
- MFI->setDynLDSAlign(DAG.getDataLayout(), *cast<GlobalVariable>(GV)); + Function &F = DAG.getMachineFunction().getFunction(); + MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV)); return SDValue( DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0); } @@ -6572,15 +6878,24 @@ SDValue SITargetLowering::lowerImage(SDValue Op, // SIShrinkInstructions will convert NSA encodings to non-NSA after register // allocation when possible. // - // TODO: we can actually allow partial NSA where the final register is a - // contiguous set of the remaining addresses. - // This could help where there are more addresses than supported. - bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) && - VAddrs.size() >= (unsigned)ST->getNSAThreshold(MF) && - VAddrs.size() <= (unsigned)ST->getNSAMaxSize(); + // Partial NSA is allowed on GFX11 where the final register is a contiguous + // set of the remaining addresses. + const unsigned NSAMaxSize = ST->getNSAMaxSize(); + const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding(); + const bool UseNSA = ST->hasNSAEncoding() && + VAddrs.size() >= ST->getNSAThreshold(MF) && + (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding); + const bool UsePartialNSA = + UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize; + SDValue VAddr; - if (!UseNSA) + if (UsePartialNSA) { + VAddr = getBuildDwordsVector(DAG, DL, + ArrayRef(VAddrs).drop_front(NSAMaxSize - 1)); + } + else if (!UseNSA) { VAddr = getBuildDwordsVector(DAG, DL, VAddrs); + } SDValue True = DAG.getTargetConstant(1, DL, MVT::i1); SDValue False = DAG.getTargetConstant(0, DL, MVT::i1); @@ -6648,7 +6963,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SmallVector<SDValue, 26> Ops; if (BaseOpcode->Store || BaseOpcode->Atomic) Ops.push_back(VData); // vdata - if (UseNSA) + if (UsePartialNSA) { + append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1)); + Ops.push_back(VAddr); + } + else if (UseNSA) append_range(Ops, VAddrs); else Ops.push_back(VAddr); @@ 
-6696,7 +7015,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op, Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a, NumVDataDwords, NumVAddrDwords); if (Opcode == -1) - return makeV_ILLEGAL(Op, DAG); + report_fatal_error( + "requested image instruction is not supported on this GPU"); } if (Opcode == -1 && Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) @@ -6706,7 +7026,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op, Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, NumVDataDwords, NumVAddrDwords); } - assert(Opcode != -1); + if (Opcode == -1) + return Op; MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops); if (auto MemOp = dyn_cast<MemSDNode>(Op)) { @@ -7021,8 +7342,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return emitRemovedIntrinsicError(DAG, DL, VT); } case Intrinsic::amdgcn_ldexp: - return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, - Op.getOperand(1), Op.getOperand(2)); + return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(1), Op.getOperand(2)); case Intrinsic::amdgcn_fract: return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); @@ -7170,52 +7490,27 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } } -/// Update \p MMO based on the offset inputs to an intrinsic. -static void updateBufferMMO(MachineMemOperand *MMO, SDValue VOffset, - SDValue SOffset, SDValue Offset, - SDValue VIndex = SDValue()) { - if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) || - !isa<ConstantSDNode>(Offset)) { - // The combined offset is not known to be constant, so we cannot represent - // it in the MMO. Give up. - MMO->setValue((Value *)nullptr); - return; - } - - if (VIndex && (!isa<ConstantSDNode>(VIndex) || - !cast<ConstantSDNode>(VIndex)->isZero())) { - // The strided index component of the address is not known to be zero, so we - // cannot represent it in the MMO. Give up. 
- MMO->setValue((Value *)nullptr); - return; - } - - MMO->setOffset(cast<ConstantSDNode>(VOffset)->getSExtValue() + - cast<ConstantSDNode>(SOffset)->getSExtValue() + - cast<ConstantSDNode>(Offset)->getSExtValue()); -} - SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const { SDLoc DL(Op); SDValue VData = Op.getOperand(2); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - VData, // vdata - Op.getOperand(3), // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - Offsets.first, // voffset - Op.getOperand(5), // soffset - Offsets.second, // offset - Op.getOperand(6), // cachepolicy - DAG.getTargetConstant(0, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + VData, // vdata + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // cachepolicy + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; auto *M = cast<MemSDNode>(Op); - updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]); EVT MemVT = VData.getValueType(); return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, @@ -7224,10 +7519,8 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, // Return a value to use for the idxen operand by examining the vindex operand. static unsigned getIdxEn(SDValue VIndex) { - if (auto VIndexC = dyn_cast<ConstantSDNode>(VIndex)) - // No need to set idxen if vindex is known to be zero. - return VIndexC->getZExtValue() != 0; - return 1; + // No need to set idxen if vindex is known to be zero. + return isNullConstant(VIndex) ? 
0 : 1; } SDValue @@ -7236,21 +7529,21 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, SDLoc DL(Op); SDValue VData = Op.getOperand(2); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - VData, // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - Offsets.first, // voffset - Op.getOperand(6), // soffset - Offsets.second, // offset - Op.getOperand(7), // cachepolicy - DAG.getTargetConstant(1, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + VData, // vdata + Rsrc, // rsrc + Op.getOperand(4), // vindex + Offsets.first, // voffset + Op.getOperand(6), // soffset + Offsets.second, // offset + Op.getOperand(7), // cachepolicy + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; auto *M = cast<MemSDNode>(Op); - updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); EVT MemVT = VData.getValueType(); return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, @@ -7330,19 +7623,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, M->getOperand(0), M->getOperand(2), M->getOperand(3), M->getMemOperand()); } - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { MemSDNode *M = cast<MemSDNode>(Op); unsigned Opc; switch (IntrID) { - case Intrinsic::amdgcn_atomic_inc: - Opc = AMDGPUISD::ATOMIC_INC; - break; - case Intrinsic::amdgcn_atomic_dec: - Opc = AMDGPUISD::ATOMIC_DEC; - break; case Intrinsic::amdgcn_ds_fmin: Opc = AMDGPUISD::ATOMIC_LOAD_FMIN; break; @@ -7384,7 +7669,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); auto *M = cast<MemSDNode>(Op); - updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]); EVT LoadVT = Op.getValueType(); if (LoadVT.getScalarType() == MVT::f16) @@ -7400,43 
+7684,50 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, M->getMemOperand(), DAG); } case Intrinsic::amdgcn_raw_buffer_load: - case Intrinsic::amdgcn_raw_buffer_load_format: { - const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format; + case Intrinsic::amdgcn_raw_ptr_buffer_load: + case Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_raw_ptr_buffer_load_format: { + const bool IsFormat = + IntrID == Intrinsic::amdgcn_raw_buffer_load_format || + IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format; + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - Offsets.first, // voffset - Op.getOperand(4), // soffset - Offsets.second, // offset - Op.getOperand(5), // cachepolicy, swizzled buffer - DAG.getTargetConstant(0, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(4), // soffset + Offsets.second, // offset + Op.getOperand(5), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; auto *M = cast<MemSDNode>(Op); - updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5]); return lowerIntrinsicLoad(M, IsFormat, DAG, Ops); } case Intrinsic::amdgcn_struct_buffer_load: - case Intrinsic::amdgcn_struct_buffer_load_format: { - const bool IsFormat = IntrID == Intrinsic::amdgcn_struct_buffer_load_format; + case Intrinsic::amdgcn_struct_ptr_buffer_load: + case Intrinsic::amdgcn_struct_buffer_load_format: + case Intrinsic::amdgcn_struct_ptr_buffer_load_format: { + const bool IsFormat = + IntrID == Intrinsic::amdgcn_struct_buffer_load_format || + IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format; + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = 
splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Offsets.first, // voffset - Op.getOperand(5), // soffset - Offsets.second, // offset - Op.getOperand(6), // cachepolicy, swizzled buffer - DAG.getTargetConstant(1, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + Rsrc, // rsrc + Op.getOperand(3), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; - auto *M = cast<MemSDNode>(Op); - updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]); return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops); } case Intrinsic::amdgcn_tbuffer_load: { @@ -7467,21 +7758,23 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op->getVTList(), Ops, LoadVT, M->getMemOperand(), DAG); } - case Intrinsic::amdgcn_raw_tbuffer_load: { + case Intrinsic::amdgcn_raw_tbuffer_load: + case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { MemSDNode *M = cast<MemSDNode>(Op); EVT LoadVT = Op.getValueType(); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - Offsets.first, // voffset - Op.getOperand(4), // soffset - Offsets.second, // offset - Op.getOperand(5), // format - Op.getOperand(6), // cachepolicy, swizzled buffer - DAG.getTargetConstant(0, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(4), // soffset + Offsets.second, // offset + Op.getOperand(5), // format + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -7491,21 +7784,23 
@@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op->getVTList(), Ops, LoadVT, M->getMemOperand(), DAG); } - case Intrinsic::amdgcn_struct_tbuffer_load: { + case Intrinsic::amdgcn_struct_tbuffer_load: + case Intrinsic::amdgcn_struct_ptr_tbuffer_load: { MemSDNode *M = cast<MemSDNode>(Op); EVT LoadVT = Op.getValueType(); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // rsrc - Op.getOperand(3), // vindex - Offsets.first, // voffset - Op.getOperand(5), // soffset - Offsets.second, // offset - Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy, swizzled buffer - DAG.getTargetConstant(1, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + Rsrc, // rsrc + Op.getOperand(3), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // format + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -7545,7 +7840,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); unsigned Opcode = 0; switch (IntrID) { @@ -7593,69 +7887,99 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, M->getMemOperand()); } case Intrinsic::amdgcn_raw_buffer_atomic_fadd: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); case Intrinsic::amdgcn_struct_buffer_atomic_fadd: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); case Intrinsic::amdgcn_raw_buffer_atomic_fmin: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: return lowerRawBufferAtomicIntrin(Op, DAG, 
AMDGPUISD::BUFFER_ATOMIC_FMIN); case Intrinsic::amdgcn_struct_buffer_atomic_fmin: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); case Intrinsic::amdgcn_raw_buffer_atomic_fmax: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX); case Intrinsic::amdgcn_struct_buffer_atomic_fmax: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX); case Intrinsic::amdgcn_raw_buffer_atomic_swap: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP); case Intrinsic::amdgcn_raw_buffer_atomic_add: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD); case Intrinsic::amdgcn_raw_buffer_atomic_sub: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB); case Intrinsic::amdgcn_raw_buffer_atomic_smin: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN); case Intrinsic::amdgcn_raw_buffer_atomic_umin: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN); case Intrinsic::amdgcn_raw_buffer_atomic_smax: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX); case Intrinsic::amdgcn_raw_buffer_atomic_umax: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX); case Intrinsic::amdgcn_raw_buffer_atomic_and: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND); case 
Intrinsic::amdgcn_raw_buffer_atomic_or: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR); case Intrinsic::amdgcn_raw_buffer_atomic_xor: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR); case Intrinsic::amdgcn_raw_buffer_atomic_inc: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC); case Intrinsic::amdgcn_raw_buffer_atomic_dec: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); case Intrinsic::amdgcn_struct_buffer_atomic_swap: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP); case Intrinsic::amdgcn_struct_buffer_atomic_add: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD); case Intrinsic::amdgcn_struct_buffer_atomic_sub: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB); case Intrinsic::amdgcn_struct_buffer_atomic_smin: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN); case Intrinsic::amdgcn_struct_buffer_atomic_umin: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN); case Intrinsic::amdgcn_struct_buffer_atomic_smax: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX); case Intrinsic::amdgcn_struct_buffer_atomic_umax: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX); case 
Intrinsic::amdgcn_struct_buffer_atomic_and: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND); case Intrinsic::amdgcn_struct_buffer_atomic_or: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR); case Intrinsic::amdgcn_struct_buffer_atomic_xor: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR); case Intrinsic::amdgcn_struct_buffer_atomic_inc: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC); case Intrinsic::amdgcn_struct_buffer_atomic_dec: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); case Intrinsic::amdgcn_buffer_atomic_cmpswap: { @@ -7677,49 +8001,50 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); } - case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: { + case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: { + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // src - Op.getOperand(3), // cmp - Op.getOperand(4), // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - Offsets.first, // voffset - Op.getOperand(6), // soffset - Offsets.second, // offset - Op.getOperand(7), // cachepolicy - DAG.getTargetConstant(0, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + Op.getOperand(2), // src + Op.getOperand(3), // cmp + 
Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(6), // soffset + Offsets.second, // offset + Op.getOperand(7), // cachepolicy + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7]); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); } - case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: { + case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: { + SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG); SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // src - Op.getOperand(3), // cmp - Op.getOperand(4), // rsrc - Op.getOperand(5), // vindex - Offsets.first, // voffset - Op.getOperand(7), // soffset - Offsets.second, // offset - Op.getOperand(8), // cachepolicy - DAG.getTargetConstant(1, DL, MVT::i1), // idxen + Op.getOperand(0), // Chain + Op.getOperand(2), // src + Op.getOperand(3), // cmp + Rsrc, // rsrc + Op.getOperand(5), // vindex + Offsets.first, // voffset + Op.getOperand(7), // soffset + Offsets.second, // offset + Op.getOperand(8), // cachepolicy + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -7844,8 +8169,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, } Ops.push_back(TDescr); - if (IsA16) - Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1)); + Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1)); Ops.push_back(M->getChain()); auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops); @@ -7853,11 
+8177,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, DAG.setNodeMemRefs(NewNode, {MemRef}); return SDValue(NewNode, 0); } - case Intrinsic::amdgcn_global_atomic_fadd: { - if (!Subtarget->hasAtomicFaddNoRtnInsts()) - return makeV_ILLEGAL(Op, DAG); - return SDValue(); - } case Intrinsic::amdgcn_global_atomic_fmin: case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fmin: @@ -8102,23 +8421,25 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, M->getMemoryVT(), M->getMemOperand()); } - case Intrinsic::amdgcn_struct_tbuffer_store: { + case Intrinsic::amdgcn_struct_tbuffer_store: + case Intrinsic::amdgcn_struct_ptr_tbuffer_store: { SDValue VData = Op.getOperand(2); bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); if (IsD16) VData = handleD16VData(VData, DAG); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { - Chain, - VData, // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - Offsets.first, // voffset - Op.getOperand(6), // soffset - Offsets.second, // offset - Op.getOperand(7), // format - Op.getOperand(8), // cachepolicy, swizzled buffer - DAG.getTargetConstant(1, DL, MVT::i1), // idxen + Chain, + VData, // vdata + Rsrc, // rsrc + Op.getOperand(4), // vindex + Offsets.first, // voffset + Op.getOperand(6), // soffset + Offsets.second, // offset + Op.getOperand(7), // format + Op.getOperand(8), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; unsigned Opc = IsD16 ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -8127,23 +8448,25 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, M->getMemoryVT(), M->getMemOperand()); } - case Intrinsic::amdgcn_raw_tbuffer_store: { + case Intrinsic::amdgcn_raw_tbuffer_store: + case Intrinsic::amdgcn_raw_ptr_tbuffer_store: { SDValue VData = Op.getOperand(2); bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); if (IsD16) VData = handleD16VData(VData, DAG); + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { - Chain, - VData, // vdata - Op.getOperand(3), // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - Offsets.first, // voffset - Op.getOperand(5), // soffset - Offsets.second, // offset - Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy, swizzled buffer - DAG.getTargetConstant(0, DL, MVT::i1), // idxen + Chain, + VData, // vdata + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // format + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -8178,7 +8501,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); - updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics EVT VDataType = VData.getValueType().getScalarType(); @@ -8190,9 +8512,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } case Intrinsic::amdgcn_raw_buffer_store: - case Intrinsic::amdgcn_raw_buffer_store_format: { + case Intrinsic::amdgcn_raw_ptr_buffer_store: + case Intrinsic::amdgcn_raw_buffer_store_format: + case Intrinsic::amdgcn_raw_ptr_buffer_store_format: { const bool IsFormat = - IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format; + IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format || + IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format; SDValue VData = Op.getOperand(2); EVT VDataVT = VData.getValueType(); @@ -8209,23 +8534,23 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, getEquivalentMemType(*DAG.getContext(), VDataVT), VData); } + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { - Chain, - VData, - Op.getOperand(3), // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - Offsets.first, // voffset - Op.getOperand(5), // soffset - Offsets.second, // offset - Op.getOperand(6), // cachepolicy, swizzled buffer - DAG.getTargetConstant(0, DL, MVT::i1), // idxen + Chain, + VData, + Rsrc, + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; unsigned Opc = IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE; Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); - updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) @@ -8236,9 +8561,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } case Intrinsic::amdgcn_struct_buffer_store: - case Intrinsic::amdgcn_struct_buffer_store_format: { + case Intrinsic::amdgcn_struct_ptr_buffer_store: + case Intrinsic::amdgcn_struct_buffer_store_format: + case Intrinsic::amdgcn_struct_ptr_buffer_store_format: { const bool IsFormat = - IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format; + IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format || + IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format; SDValue VData = Op.getOperand(2); EVT VDataVT = VData.getValueType(); @@ -8256,23 +8584,23 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, getEquivalentMemType(*DAG.getContext(), VDataVT), VData); } + auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG); auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { - Chain, - VData, - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - Offsets.first, // voffset - Op.getOperand(6), // soffset - Offsets.second, // offset - Op.getOperand(7), // cachepolicy, swizzled buffer - DAG.getTargetConstant(1, DL, MVT::i1), // idxen + Chain, + VData, + Rsrc, + Op.getOperand(4), // vindex + Offsets.first, // voffset + Op.getOperand(6), // soffset + Offsets.second, // offset + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; - unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ? - AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; + unsigned Opc = + !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); - updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics EVT VDataType = VData.getValueType().getScalarType(); @@ -8283,9 +8611,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, M->getMemoryVT(), M->getMemOperand()); } case Intrinsic::amdgcn_raw_buffer_load_lds: - case Intrinsic::amdgcn_struct_buffer_load_lds: { + case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: + case Intrinsic::amdgcn_struct_buffer_load_lds: + case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { unsigned Opc; - bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds; + bool HasVIndex = + IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds || + IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds; unsigned OpOffset = HasVIndex ? 1 : 0; SDValue VOffset = Op.getOperand(5 + OpOffset); auto CVOffset = dyn_cast<ConstantSDNode>(VOffset); @@ -8328,7 +8660,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, else if (HasVOffset) Ops.push_back(VOffset); - Ops.push_back(Op.getOperand(2)); // rsrc + SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG); + Ops.push_back(Rsrc); Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset unsigned Aux = Op.getConstantOperandVal(8 + OpOffset); @@ -8341,8 +8674,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, auto *M = cast<MemSDNode>(Op); MachineMemOperand *LoadMMO = M->getMemOperand(); + // Don't set the offset value here because the pointer points to the base of + // the buffer. 
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); - LoadPtrI.Offset = Op->getConstantOperandVal(7 + OpOffset); + MachinePointerInfo StorePtrI = LoadPtrI; StorePtrI.V = nullptr; StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; @@ -8450,27 +8785,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } } -SDValue SITargetLowering::makeV_ILLEGAL(SDValue Op, SelectionDAG & DAG) const { - // Create the V_ILLEGAL node. - SDLoc DL(Op); - auto Opcode = Subtarget->getGeneration() < AMDGPUSubtarget::GFX10 ? - AMDGPU::V_ILLEGAL_gfx6_gfx7_gfx8_gfx9 : AMDGPU::V_ILLEGAL; - auto EntryNode = DAG.getEntryNode(); - auto IllegalNode = DAG.getMachineNode(Opcode, DL, MVT::Other, EntryNode); - auto IllegalVal = SDValue(IllegalNode, 0u); - - // Add the V_ILLEGAL node to the root chain to prevent its removal. - auto Chains = SmallVector<SDValue, 2u>(); - Chains.push_back(IllegalVal); - Chains.push_back(DAG.getRoot()); - auto Root = DAG.getTokenFactor(SDLoc(Chains.back()), Chains); - DAG.setRoot(Root); - - // Merge with UNDEF to satisfy return value requirements. 
- auto UndefVal = DAG.getUNDEF(Op.getValueType()); - return DAG.getMergeValues({UndefVal, IllegalVal}, DL); -} - // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: // offset (the offset that is included in bounds checking and swizzling, to be // split between the instruction's voffset and immoffset fields) and soffset @@ -8480,7 +8794,7 @@ SDValue SITargetLowering::makeV_ILLEGAL(SDValue Op, SelectionDAG & DAG) const { std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets( SDValue Offset, SelectionDAG &DAG) const { SDLoc DL(Offset); - const unsigned MaxImm = 4095; + const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(); SDValue N0 = Offset; ConstantSDNode *C1 = nullptr; @@ -8493,13 +8807,14 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets( if (C1) { unsigned ImmOffset = C1->getZExtValue(); - // If the immediate value is too big for the immoffset field, put the value - // and -4096 into the immoffset field so that the value that is copied/added - // for the voffset field is a multiple of 4096, and it stands more chance - // of being CSEd with the copy/add for another similar load/store. - // However, do not do that rounding down to a multiple of 4096 if that is a - // negative number, as it appears to be illegal to have a negative offset - // in the vgpr, even if adding the immediate offset makes it positive. + // If the immediate value is too big for the immoffset field, put only bits + // that would normally fit in the immoffset field. The remaining value that + // is copied/added for the voffset field is a large power of 2, and it + // stands more chance of being CSEd with the copy/add for another similar + // load/store. + // However, do not do that rounding down if that is a negative + // number, as it appears to be illegal to have a negative offset in the + // vgpr, even if adding the immediate offset makes it positive. 
unsigned Overflow = ImmOffset & ~MaxImm; ImmOffset -= Overflow; if ((int32_t)Overflow < 0) { @@ -8530,12 +8845,12 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets( void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, SDValue *Offsets, Align Alignment) const { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); SDLoc DL(CombinedOffset); - if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) { + if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) { uint32_t Imm = C->getZExtValue(); uint32_t SOffset, ImmOffset; - if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, - Alignment)) { + if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) { Offsets[0] = DAG.getConstant(0, DL, MVT::i32); Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); @@ -8547,8 +8862,8 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, SDValue N1 = CombinedOffset.getOperand(1); uint32_t SOffset, ImmOffset; int Offset = cast<ConstantSDNode>(N1)->getSExtValue(); - if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, - Subtarget, Alignment)) { + if (Offset >= 0 && + TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) { Offsets[0] = N0; Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); @@ -8560,6 +8875,55 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32); } +SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer, + SelectionDAG &DAG) const { + if (!MaybePointer.getValueType().isScalarInteger()) + return MaybePointer; + + SDLoc DL(MaybePointer); + + SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer); + return Rsrc; +} + +// Wrap a global or flat pointer into a buffer intrinsic using the flags +// specified in the intrinsic. 
+SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op, + SelectionDAG &DAG) const { + SDLoc Loc(Op); + + SDValue Pointer = Op->getOperand(1); + SDValue Stride = Op->getOperand(2); + SDValue NumRecords = Op->getOperand(3); + SDValue Flags = Op->getOperand(4); + + auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32); + SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32); + SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask); + std::optional<uint32_t> ConstStride = std::nullopt; + if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride)) + ConstStride = ConstNode->getZExtValue(); + + SDValue NewHighHalf = Masked; + if (!ConstStride || *ConstStride != 0) { + SDValue ShiftedStride; + if (ConstStride) { + ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32); + } else { + SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32); + ShiftedStride = + DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride, + DAG.getShiftAmountConstant(16, MVT::i32, Loc)); + } + NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride); + } + + SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, + NewHighHalf, NumRecords, Flags); + SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc); + return RsrcPtr; +} + // Handle 8 bit and 16 bit buffer loads SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL, @@ -8683,6 +9047,14 @@ SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL); } +static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, + const SIMachineFunctionInfo &Info) { + // TODO: Should check if the address can definitely not access stack. 
+ if (Info.isEntryFunction()) + return Info.hasFlatScratchInit(); + return true; +} + SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); LoadSDNode *Load = cast<LoadSDNode>(Op); @@ -8749,7 +9121,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { // then we need to use the same legalization rules we use for private. if (AS == AMDGPUAS::FLAT_ADDRESS && !Subtarget->hasMultiDwordFlatScratchAddressing()) - AS = MFI->hasFlatScratchInit() ? + AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ? AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; unsigned NumElements = MemVT.getVectorNumElements(); @@ -8883,26 +9255,30 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool AllowInaccurateRcp = Flags.hasApproximateFuncs(); - - // Without !fpmath accuracy information, we can't do more because we don't - // know exactly whether rcp is accurate enough to meet !fpmath requirement. - if (!AllowInaccurateRcp) - return SDValue(); + bool AllowInaccurateRcp = Flags.hasApproximateFuncs() || + DAG.getTarget().Options.UnsafeFPMath; if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { + // Without !fpmath accuracy information, we can't do more because we don't + // know exactly whether rcp is accurate enough to meet !fpmath requirement. + // f16 is always accurate enough + if (!AllowInaccurateRcp && VT != MVT::f16) + return SDValue(); + if (CLHS->isExactlyValue(1.0)) { // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to // the CI documentation has a worst case error of 1 ulp. // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to // use it as long as we aren't trying to use denormals. // - // v_rcp_f16 and v_rsq_f16 DO support denormals. + // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp. 
// 1.0 / sqrt(x) -> rsq(x) // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP // error seems really high at 2^29 ULP. + + // XXX - do we need afn for this or is arcp sufficient? if (RHS.getOpcode() == ISD::FSQRT) return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); @@ -8918,6 +9294,11 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, } } + // For f16 require arcp only. + // For f32 require afn+arcp. + if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal())) + return SDValue(); + // Turn into multiply by the reciprocal. // x / y -> x * (1.0 / y) SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); @@ -9017,16 +9398,17 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { // Faster 2.5 ULP division that does not support denormals. SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { + SDNodeFlags Flags = Op->getFlags(); SDLoc SL(Op); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags); - const APFloat K0Val(BitsToFloat(0x6f800000)); + const APFloat K0Val(0x1p+96f); const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); - const APFloat K1Val(BitsToFloat(0x2f800000)); + const APFloat K1Val(0x1p-32f); const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); @@ -9036,30 +9418,27 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags); - // TODO: Should this propagate fast-math-flags? - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags); // rcp does not support denormals.
- SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags); - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags); } // Returns immediate value for setting the F32 denorm mode when using the // S_DENORM_MODE instruction. -static SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG, - const SDLoc &SL, const GCNSubtarget *ST) { +static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, + const SIMachineFunctionInfo *Info, + const GCNSubtarget *ST) { assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE"); - int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction()) - ? FP_DENORM_FLUSH_NONE - : FP_DENORM_FLUSH_IN_FLUSH_OUT; - - int Mode = SPDenormMode | (DPDenormModeDefault << 2); - return DAG.getTargetConstant(Mode, SL, MVT::i32); + uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue(); + uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2); + return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32); } SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { @@ -9097,7 +9476,11 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32); - const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction()); + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const DenormalMode DenormMode = Info->getMode().FP32Denormals; + + const bool HasFP32Denormals = DenormMode == DenormalMode::getIEEE(); if (!HasFP32Denormals) { // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV @@ -9109,7 
+9492,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDNode *EnableDenorm; if (Subtarget->hasDenormModeInst()) { const SDValue EnableDenormValue = - getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget); + getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget); EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, DAG.getEntryNode(), EnableDenormValue).getNode(); @@ -9149,10 +9532,13 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { NumeratorScaled, Fma3, Flags); if (!HasFP32Denormals) { + // FIXME: This mishandles dynamic denormal mode. We need to query the + // current mode and restore the original. + SDNode *DisableDenorm; if (Subtarget->hasDenormModeInst()) { - const SDValue DisableDenormValue = - getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget); + const SDValue DisableDenormValue = getSPDenormModeValue( + FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget); DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, Fma4.getValue(1), DisableDenormValue, @@ -9260,6 +9646,36 @@ SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("Unexpected type for fdiv"); } +SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const { + SDLoc dl(Op); + SDValue Val = Op.getOperand(0); + EVT VT = Val.getValueType(); + EVT ResultExpVT = Op->getValueType(1); + EVT InstrExpVT = VT == MVT::f16 ? 
MVT::i16 : MVT::i32; + + SDValue Mant = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val); + + SDValue Exp = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT, + DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val); + + if (Subtarget->hasFractBug()) { + SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val); + SDValue Inf = DAG.getConstantFP( + APFloat::getInf(SelectionDAG::EVTToAPFloatSemantics(VT)), dl, VT); + + SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT); + SDValue Zero = DAG.getConstant(0, dl, InstrExpVT); + Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero); + Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val); + } + + SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT); + return DAG.getMergeValues({Mant, CastExp}, dl); +} + SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); StoreSDNode *Store = cast<StoreSDNode>(Op); @@ -9287,7 +9703,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // then we need to use the same legalization rules we use for private. if (AS == AMDGPUAS::FLAT_ADDRESS && !Subtarget->hasMultiDwordFlatScratchAddressing()) - AS = MFI->hasFlatScratchInit() ? + AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ? 
AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; unsigned NumElements = VT.getVectorNumElements(); @@ -9338,6 +9754,87 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } +SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const { + // For double type, the SQRT and RSQ instructions don't have required + // precision, we apply Goldschmidt's algorithm to improve the result: + // + // y0 = rsq(x) + // g0 = x * y0 + // h0 = 0.5 * y0 + // + // r0 = 0.5 - h0 * g0 + // g1 = g0 * r0 + g0 + // h1 = h0 * r0 + h0 + // + // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 + // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1 + // h2 = h1 * r1 + h1 + // + // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 + // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 + // + // sqrt(x) = g3 + + SDNodeFlags Flags = Op->getFlags(); + + SDLoc DL(Op); + + SDValue X = Op.getOperand(0); + SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64); + + SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT); + + SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32); + + // Scale up input if it is too small. 
+ SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32); + SDValue ScaleUp = + DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt); + SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags); + + SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX); + + SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY); + + SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64); + SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half); + + SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0); + SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half); + + SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0); + + SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0); + + SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1); + SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX); + + SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1); + + SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2); + SDValue SqrtD1 = + DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX); + + SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2); + + SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32); + SDValue ScaleDown = + DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt); + SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags); + + // TODO: Switch to fcmp oeq 0 for finite only. 
Can't fully remove this check + // with finite only or nsz because rsq(+/-0) = +/-inf + + // TODO: Check for DAZ and expand to subnormals + SDValue IsZeroOrInf = + DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX, + DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32)); + + // If x is +INF, +0, or -0, use its original value + return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet, + Flags); +} + SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); @@ -9432,7 +9929,53 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performFCopySignCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue MagnitudeOp = N->getOperand(0); + SDValue SignOp = N->getOperand(1); + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + // f64 fcopysign is really an f32 copysign on the high bits, so replace the + // lower half with a copy. + // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) + if (MagnitudeOp.getValueType() == MVT::f64) { + SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp); + SDValue MagLo = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, + DAG.getConstant(0, DL, MVT::i32)); + SDValue MagHi = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, + DAG.getConstant(1, DL, MVT::i32)); + + SDValue HiOp = + DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp); + + SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); + + return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); + } + + if (SignOp.getValueType() != MVT::f64) + return SDValue(); + + // Reduce width of sign operand, we only need the highest bit. + // + // fcopysign f64:x, f64:y -> + // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1) + // TODO: In some cases it might make sense to go all the way to f16. 
+ SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp); + SDValue SignAsF32 = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, + DAG.getConstant(1, DL, MVT::i32)); + + return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0), + SignAsF32); +} + // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) +// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no +// bits // This is a variant of // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), @@ -9467,8 +10010,14 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, if (!CAdd) return SDValue(); - // If the resulting offset is too large, we can't fold it into the addressing - // mode offset. + SelectionDAG &DAG = DCI.DAG; + + if (N0->getOpcode() == ISD::OR && + !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) + return SDValue(); + + // If the resulting offset is too large, we can't fold it into the + // addressing mode offset. APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext()); @@ -9478,7 +10027,6 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace)) return SDValue(); - SelectionDAG &DAG = DCI.DAG; SDLoc SL(N); EVT VT = N->getValueType(0); @@ -9604,7 +10152,7 @@ static uint32_t getConstantPermuteMask(uint32_t C) { // value 0-3 selects corresponding source byte; // value 0xc selects zero; // value 0xff selects 0xff. 
-static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) { +static uint32_t getPermuteMask(SDValue V) { assert(V.getValueSizeInBits() == 32); if (V.getNumOperands() != 2) @@ -9620,15 +10168,13 @@ static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) { default: break; case ISD::AND: - if (uint32_t ConstMask = getConstantPermuteMask(C)) { + if (uint32_t ConstMask = getConstantPermuteMask(C)) return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask); - } break; case ISD::OR: - if (uint32_t ConstMask = getConstantPermuteMask(C)) { + if (uint32_t ConstMask = getConstantPermuteMask(C)) return (0x03020100 & ~ConstMask) | ConstMask; - } break; case ISD::SHL: @@ -9676,7 +10222,7 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) { if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { unsigned Shift = CShift->getZExtValue(); - unsigned NB = CRHS->getAPIntValue().countTrailingZeros(); + unsigned NB = CRHS->getAPIntValue().countr_zero(); unsigned Offset = NB + Shift; if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary. SDLoc SL(N); @@ -9787,8 +10333,8 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { - uint32_t LHSMask = getPermuteMask(DAG, LHS); - uint32_t RHSMask = getPermuteMask(DAG, RHS); + uint32_t LHSMask = getPermuteMask(LHS); + uint32_t RHSMask = getPermuteMask(RHS); if (LHSMask != ~0u && RHSMask != ~0u) { // Canonicalize the expression in an attempt to have fewer unique masks // and therefore fewer registers used to hold the masks. @@ -9834,6 +10380,325 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, return SDValue(); } +// A key component of v_perm is a mapping between byte position of the src +// operands, and the byte position of the dest. 
To provide such, we need: 1. the +// node that provides x byte of the dest of the OR, and 2. the byte of the node +// used to provide that x byte. calculateByteProvider finds which node provides +// a certain byte of the dest of the OR, and calculateSrcByte takes that node, +// and finds an ultimate src and byte position For example: The supported +// LoadCombine pattern for vector loads is as follows +// t1 +// or +// / \ +// t2 t3 +// zext shl +// | | \ +// t4 t5 16 +// or anyext +// / \ | +// t6 t7 t8 +// srl shl or +// / | / \ / \ +// t9 t10 t11 t12 t13 t14 +// trunc* 8 trunc* 8 and and +// | | / | | \ +// t15 t16 t17 t18 t19 t20 +// trunc* 255 srl -256 +// | / \ +// t15 t15 16 +// +// *In this example, the truncs are from i32->i16 +// +// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3 +// respectively. calculateSrcByte would find (given node) -> ultimate src & +// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3. +// After finding the mapping, we can combine the tree into vperm t15, t16, +// 0x05000407 + +// Find the source and byte position from a node. +// \p DestByte is the byte position of the dest of the or that the src +// ultimately provides. \p SrcIndex is the byte of the src that maps to this +// dest of the or byte. \p Depth tracks how many recursive iterations we have +// performed. 
+static const std::optional<ByteProvider<SDValue>> +calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0, + unsigned Depth = 0) { + // We may need to recursively traverse a series of SRLs + if (Depth >= 6) + return std::nullopt; + + switch (Op->getOpcode()) { + case ISD::TRUNCATE: { + if (Op->getOperand(0).getScalarValueSizeInBits() != 32) + return std::nullopt; + return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); + } + + case ISD::SRL: { + auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!ShiftOp) + return std::nullopt; + + uint64_t BitShift = ShiftOp->getZExtValue(); + + if (BitShift % 8 != 0) + return std::nullopt; + + SrcIndex += BitShift / 8; + + return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); + } + + default: { + if (Op.getScalarValueSizeInBits() != 32) + return std::nullopt; + + return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex); + } + } + llvm_unreachable("fully handled switch"); +} + +// For a byte position in the result of an Or, traverse the tree and find the +// node (and the byte of the node) which ultimately provides this {Or, +// BytePosition}. \p Op is the operand we are currently examining. \p Index is +// the byte position of the Op that corresponds with the originally requested +// byte of the Or \p Depth tracks how many recursive iterations we have +// performed. 
\p StartingIndex is the originally requested byte of the Or +static const std::optional<ByteProvider<SDValue>> +calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, + unsigned StartingIndex = 0) { + // Finding Src tree of RHS of or typically requires at least 1 additional + // depth + if (Depth > 6) + return std::nullopt; + + unsigned BitWidth = Op.getScalarValueSizeInBits(); + if (BitWidth % 8 != 0) + return std::nullopt; + assert(Index < BitWidth / 8 && "invalid index requested"); + + switch (Op.getOpcode()) { + case ISD::OR: { + auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1, + StartingIndex); + if (!RHS) + return std::nullopt; + auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1, + StartingIndex); + if (!LHS) + return std::nullopt; + // A well formed Or will have two ByteProviders for each byte, one of which + // is constant zero + if (!LHS->isConstantZero() && !RHS->isConstantZero()) + return std::nullopt; + if (!LHS || LHS->isConstantZero()) + return RHS; + if (!RHS || RHS->isConstantZero()) + return LHS; + return std::nullopt; + } + + case ISD::AND: { + auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!BitMaskOp) + return std::nullopt; + + uint64_t BitMask = BitMaskOp->getZExtValue(); + // Bits we expect for our Index (64-bit so the shift is defined for Index < 8) + uint64_t IndexMask = uint64_t(0xFF) << (Index * 8); + + if ((IndexMask & BitMask) != IndexMask) { + // If the result of the and partially provides the byte, then it + // is not well formatted + if (IndexMask & BitMask) + return std::nullopt; + return ByteProvider<SDValue>::getConstantZero(); + } + + return calculateSrcByte(Op->getOperand(0), StartingIndex, Index); + } + + case ISD::SRL: { + auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!ShiftOp) + return std::nullopt; + + uint64_t BitShift = ShiftOp->getZExtValue(); + if (BitShift % 8) + return std::nullopt; + + auto BitsProvided = Op.getScalarValueSizeInBits(); + if (BitsProvided % 8 != 0) + return
std::nullopt; + + uint64_t BytesProvided = BitsProvided / 8; + uint64_t ByteShift = BitShift / 8; + // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes. + // If the byte we are trying to provide (as tracked by index) falls in this + // range, then the SRL provides the byte. The byte of interest of the src of + // the SRL is Index + ByteShift + return BytesProvided - ByteShift > Index + ? calculateSrcByte(Op->getOperand(0), StartingIndex, + Index + ByteShift) + : ByteProvider<SDValue>::getConstantZero(); + } + + case ISD::SHL: { + auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!ShiftOp) + return std::nullopt; + + uint64_t BitShift = ShiftOp->getZExtValue(); + if (BitShift % 8 != 0) + return std::nullopt; + uint64_t ByteShift = BitShift / 8; + + // If we are shifting by a byte amount strictly greater than + // the index we are trying to provide, then it provides 0s. If not, + // then these bytes are not definitively 0s, and the corresponding byte + // of interest is Index - ByteShift of the src + return Index < ByteShift + ? ByteProvider<SDValue>::getConstantZero() + : calculateByteProvider(Op.getOperand(0), Index - ByteShift, + Depth + 1, StartingIndex); + } + case ISD::ANY_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: { + SDValue NarrowOp = Op->getOperand(0); + unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); + if (NarrowBitWidth % 8 != 0) + return std::nullopt; + uint64_t NarrowByteWidth = NarrowBitWidth / 8; + + if (Index >= NarrowByteWidth) + return Op.getOpcode() == ISD::ZERO_EXTEND + ?
std::optional<ByteProvider<SDValue>>( + ByteProvider<SDValue>::getConstantZero()) + : std::nullopt; + return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex); + } + + case ISD::TRUNCATE: { + unsigned NarrowBitWidth = Op.getScalarValueSizeInBits(); + if (NarrowBitWidth % 8 != 0) + return std::nullopt; + uint64_t NarrowByteWidth = NarrowBitWidth / 8; + + if (NarrowByteWidth >= Index) { + return calculateByteProvider(Op.getOperand(0), Index, Depth + 1, + StartingIndex); + } + + return std::nullopt; + } + + case ISD::LOAD: { + auto L = cast<LoadSDNode>(Op.getNode()); + unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); + if (NarrowBitWidth % 8 != 0) + return std::nullopt; + uint64_t NarrowByteWidth = NarrowBitWidth / 8; + + // If the width of the load does not reach byte we are trying to provide for + // and it is not a ZEXTLOAD, then the load does not provide for the byte in + // question + if (Index >= NarrowByteWidth) { + return L->getExtensionType() == ISD::ZEXTLOAD + ? 
std::optional<ByteProvider<SDValue>>( + ByteProvider<SDValue>::getConstantZero()) + : std::nullopt; + } + + if (NarrowByteWidth > Index) { + return calculateSrcByte(Op, StartingIndex, Index); + } + + return std::nullopt; + } + + case ISD::BSWAP: + return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1, + Depth + 1, StartingIndex); + default: { + return std::nullopt; + } + } + + llvm_unreachable("fully handled switch"); +} + +// Returns true if the Operand is a scalar and is 16 bits +static bool is16BitScalarOp(SDValue &Operand) { + switch (Operand.getOpcode()) { + case ISD::ANY_EXTEND: + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: { + auto OpVT = Operand.getOperand(0).getValueType(); + return !OpVT.isVector() && OpVT.getSizeInBits() == 16; + } + case ISD::LOAD: { + LoadSDNode *L = cast<LoadSDNode>(Operand.getNode()); + auto ExtType = cast<LoadSDNode>(L)->getExtensionType(); + if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD || + ExtType == ISD::EXTLOAD) { + auto MemVT = L->getMemoryVT(); + return !MemVT.isVector() && MemVT.getSizeInBits() == 16; + } + return false; + } + default: + return false; + } +} + +// Returns true if the mask matches consecutive bytes, and the first byte +// begins at a power of 2 byte offset from 0th byte +static bool addresses16Bits(int Mask) { + int Low8 = Mask & 0xff; + int Hi8 = (Mask & 0xff00) >> 8; + + assert(Low8 < 8 && Hi8 < 8); + // Are the bytes contiguous in the order of increasing addresses. + bool IsConsecutive = (Hi8 - Low8 == 1); + // Is the first byte at location that is aligned for 16 bit instructions. + // A counter example is taking 2 consecutive bytes starting at the 8th bit. 
+ // In this case, we still need code to extract the 16 bit operand, so it + // is better to use i8 v_perm + bool Is16Aligned = !(Low8 % 2); + + return IsConsecutive && Is16Aligned; +} + +// Do not lower into v_perm if the operands are actually 16 bit +// and the selected bits (based on PermMask) correspond with two +// easily addressable 16 bit operands. +static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op, + SDValue &OtherOp) { + int Low16 = PermMask & 0xffff; + int Hi16 = (PermMask & 0xffff0000) >> 16; + + // ByteProvider only accepts 32 bit operands + assert(Op.getValueType().getSizeInBits() == 32); + assert(OtherOp.getValueType().getSizeInBits() == 32); + + auto OpIs16Bit = is16BitScalarOp(Op); + auto OtherOpIs16Bit = is16BitScalarOp(OtherOp); + + // If there is a size mismatch, then we must use masking on at least one + // operand + if (OpIs16Bit != OtherOpIs16Bit) + return true; + + // If both operands are 16 bit, return whether or not we cleanly address both + if (OpIs16Bit && OtherOpIs16Bit) + return !addresses16Bits(Low16) || !addresses16Bits(Hi16); + + // Both are 32 bit operands + return true; +} + SDValue SITargetLowering::performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -9884,8 +10749,36 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { - uint32_t LHSMask = getPermuteMask(DAG, LHS); - uint32_t RHSMask = getPermuteMask(DAG, RHS); + + // If all the uses of an or need to extract the individual elements, do not + // attempt to lower into v_perm + auto usesCombinedOperand = [](SDNode *OrUse) { + // If we have any non-vectorized use, then it is a candidate for v_perm + if (OrUse->getOpcode() != ISD::BITCAST || + !OrUse->getValueType(0).isVector()) + return true; + + // If we have any
non-vectorized use, then it is a candidate for v_perm + for (auto VUse : OrUse->uses()) { + if (!VUse->getValueType(0).isVector()) + return true; + + // If the use of a vector is a store, then combining via a v_perm + // is beneficial. + // TODO -- whitelist more uses + for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg}) + if (VUse->getOpcode() == VectorwiseOp) + return true; + } + return false; + }; + + if (!any_of(N->uses(), usesCombinedOperand)) + return SDValue(); + + uint32_t LHSMask = getPermuteMask(LHS); + uint32_t RHSMask = getPermuteMask(RHS); + if (LHSMask != ~0u && RHSMask != ~0u) { // Canonicalize the expression in an attempt to have fewer unique masks // and therefore fewer registers used to hold the masks. @@ -9918,6 +10811,71 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, DAG.getConstant(Sel, DL, MVT::i32)); } } + if (LHSMask == ~0u || RHSMask == ~0u) { + SmallVector<ByteProvider<SDValue>, 8> PermNodes; + + // VT is known to be MVT::i32, so we need to provide 4 bytes. 
+ assert(VT == MVT::i32); + for (int i = 0; i < 4; i++) { + // Find the ByteProvider that provides the ith byte of the result of OR + std::optional<ByteProvider<SDValue>> P = + calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i); + // TODO support constantZero + if (!P || P->isConstantZero()) + return SDValue(); + + PermNodes.push_back(*P); + } + if (PermNodes.size() != 4) + return SDValue(); + + int FirstSrc = 0; + std::optional<int> SecondSrc; + uint64_t permMask = 0x00000000; + for (size_t i = 0; i < PermNodes.size(); i++) { + auto PermOp = PermNodes[i]; + // Since the mask is applied to Src1:Src2, Src1 bytes must be offset + // by sizeof(Src2) = 4 + int SrcByteAdjust = 4; + + if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) { + if (SecondSrc.has_value()) + if (!PermOp.hasSameSrc(PermNodes[*SecondSrc])) + return SDValue(); + // Set the index of the second distinct Src node + SecondSrc = i; + assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() == + 32); + SrcByteAdjust = 0; + } + assert(PermOp.SrcOffset + SrcByteAdjust < 8); + assert(!DAG.getDataLayout().isBigEndian()); + permMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8); + } + + SDValue Op = *PermNodes[FirstSrc].Src; + SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src + : *PermNodes[FirstSrc].Src; + + // Check that we are not just extracting the bytes in order from an op + if (Op == OtherOp) { + int Low16 = permMask & 0xffff; + int Hi16 = (permMask & 0xffff0000) >> 16; + + bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100); + bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302); + + // The perm op would really just produce Op. 
So combine into Op + if (WellFormedLow && WellFormedHi) + return Op; + } + + if (hasEightBitAccesses(permMask, Op, OtherOp)) { + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp, + DAG.getConstant(permMask, DL, MVT::i32)); + } + } } if (VT != MVT::i64 || DCI.isBeforeLegalizeOps()) @@ -9966,20 +10924,40 @@ SDValue SITargetLowering::performXorCombine(SDNode *N, if (SDValue RV = reassociateScalarOps(N, DCI.DAG)) return RV; - EVT VT = N->getValueType(0); - if (VT != MVT::i64) - return SDValue(); - SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS); - if (CRHS) { + SelectionDAG &DAG = DCI.DAG; + + EVT VT = N->getValueType(0); + if (CRHS && VT == MVT::i64) { if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS)) return Split; } + // Make sure to apply the 64-bit constant splitting fold before trying to fold + // fneg-like xors into 64-bit select. + if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) { + // This looks like an fneg, try to fold as a source modifier. 
+ if (CRHS && CRHS->getAPIntValue().isSignMask() && + shouldFoldFNegIntoSrc(N, LHS)) { + // xor (select c, a, b), 0x80000000 -> + // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b))) + SDLoc DL(N); + SDValue CastLHS = + DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1)); + SDValue CastRHS = + DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2)); + SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS); + SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS); + SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32, + LHS->getOperand(0), FNegLHS, FNegRHS); + return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect); + } + } + return SDValue(); } @@ -10086,10 +11064,15 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, return true; if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) { - auto F = CFP->getValueAPF(); + const auto &F = CFP->getValueAPF(); if (F.isNaN() && F.isSignaling()) return false; - return !F.isDenormal() || denormalsEnabledForType(DAG, Op.getValueType()); + if (!F.isDenormal()) + return true; + + DenormalMode Mode = + DAG.getMachineFunction().getDenormalMode(F.getSemantics()); + return Mode == DenormalMode::getIEEE(); } // If source is a result of another standard FP operation it is already in @@ -10111,6 +11094,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, case ISD::FREM: case ISD::FP_ROUND: case ISD::FP_EXTEND: + case ISD::FLDEXP: case AMDGPUISD::FMUL_LEGACY: case AMDGPUISD::FMAD_FTZ: case AMDGPUISD::RCP: @@ -10118,11 +11102,12 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, case AMDGPUISD::RSQ_CLAMP: case AMDGPUISD::RCP_LEGACY: case AMDGPUISD::RCP_IFLAG: + case AMDGPUISD::LOG: + case AMDGPUISD::EXP: case AMDGPUISD::DIV_SCALE: case AMDGPUISD::DIV_FMAS: case AMDGPUISD::DIV_FIXUP: case AMDGPUISD::FRACT: - case AMDGPUISD::LDEXP: case AMDGPUISD::CVT_PKRTZ_F16_F32: case AMDGPUISD::CVT_F32_UBYTE0: case AMDGPUISD::CVT_F32_UBYTE1: @@ -10156,6 
+11141,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, // snans will be quieted, so we only need to worry about denormals. if (Subtarget->supportsMinMaxDenormModes() || + // FIXME: denormalsEnabledForType is broken for dynamic denormalsEnabledForType(DAG, Op.getValueType())) return true; @@ -10225,6 +11211,8 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_rsq_legacy: case Intrinsic::amdgcn_trig_preop: + case Intrinsic::amdgcn_log: + case Intrinsic::amdgcn_exp2: return true; default: break; @@ -10233,6 +11221,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, [[fallthrough]]; } default: + // FIXME: denormalsEnabledForType is broken for dynamic return denormalsEnabledForType(DAG, Op.getValueType()) && DAG.isKnownNeverSNaN(Op); } @@ -10254,8 +11243,11 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) { if (FCR->Value.isSignaling()) return false; - return !FCR->Value.isDenormal() || - denormalsEnabledForType(MRI.getType(FCR->VReg), MF); + if (!FCR->Value.isDenormal()) + return true; + + DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics()); + return Mode == DenormalMode::getIEEE(); } if (MaxDepth == 0) @@ -10298,6 +11290,7 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, case AMDGPU::G_FMINNUM_IEEE: case AMDGPU::G_FMAXNUM_IEEE: { if (Subtarget->supportsMinMaxDenormModes() || + // FIXME: denormalsEnabledForType is broken for dynamic denormalsEnabledForType(MRI.getType(Reg), MF)) return true; @@ -10316,6 +11309,8 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, case Intrinsic::amdgcn_fmed3: case Intrinsic::amdgcn_sin: case Intrinsic::amdgcn_cos: + case Intrinsic::amdgcn_log: + case Intrinsic::amdgcn_exp2: case Intrinsic::amdgcn_log_clamp: case Intrinsic::amdgcn_rcp: case 
Intrinsic::amdgcn_rcp_legacy: @@ -10352,9 +11347,16 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, SDValue SITargetLowering::getCanonicalConstantFP( SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const { // Flush denormals to 0 if not enabled. - if (C.isDenormal() && !denormalsEnabledForType(DAG, VT)) { - return DAG.getConstantFP(APFloat::getZero(C.getSemantics(), - C.isNegative()), SL, VT); + if (C.isDenormal()) { + DenormalMode Mode = + DAG.getMachineFunction().getDenormalMode(C.getSemantics()); + if (Mode == DenormalMode::getPreserveSign()) { + return DAG.getConstantFP( + APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT); + } + + if (Mode != DenormalMode::getIEEE()) + return SDValue(); } if (C.isNaN()) { @@ -10490,45 +11492,41 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { } } -SDValue SITargetLowering::performIntMed3ImmCombine( - SelectionDAG &DAG, const SDLoc &SL, - SDValue Op0, SDValue Op1, bool Signed) const { - ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1); - if (!K1) - return SDValue(); +SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG, + const SDLoc &SL, SDValue Src, + SDValue MinVal, + SDValue MaxVal, + bool Signed) const { - ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1)); - if (!K0) + // med3 comes from + // min(max(x, K0), K1), K0 < K1 + // max(min(x, K0), K1), K1 < K0 + // + // "MinVal" and "MaxVal" respectively refer to the rhs of the + // min/max op. 
+ ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal); + ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal); + + if (!MinK || !MaxK) return SDValue(); if (Signed) { - if (K0->getAPIntValue().sge(K1->getAPIntValue())) + if (MaxK->getAPIntValue().sge(MinK->getAPIntValue())) return SDValue(); } else { - if (K0->getAPIntValue().uge(K1->getAPIntValue())) + if (MaxK->getAPIntValue().uge(MinK->getAPIntValue())) return SDValue(); } - EVT VT = K0->getValueType(0); + EVT VT = MinK->getValueType(0); unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3; - if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) { - return DAG.getNode(Med3Opc, SL, VT, - Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); - } - - // If there isn't a 16-bit med3 operation, convert to 32-bit. - if (VT == MVT::i16) { - MVT NVT = MVT::i32; - unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - - SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); - SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); - SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); - - SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3); - return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3); - } + if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) + return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal); + // Note: we could also extend to i32 and use i32 med3 if i16 med3 is + // not available, but this is unlikely to be profitable as constants + // will often need to be materialized & extended, especially on + // pre-GFX10 where VOP3 instructions couldn't take literal operands. 
return SDValue(); } @@ -10640,13 +11638,26 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, } // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) + // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0) if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { - if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true)) + if (SDValue Med3 = performIntMed3ImmCombine( + DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true)) + return Med3; + } + if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine( + DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true)) return Med3; } if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { - if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false)) + if (SDValue Med3 = performIntMed3ImmCombine( + DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false)) + return Med3; + } + if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine( + DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false)) return Med3; } @@ -10930,6 +11941,70 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N, return DAG.getBuildVector(VecVT, SL, Ops); } +/// Return the source of an fp_extend from f16 to f32, or a converted FP +/// constant. 
+static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) { + if (Src.getOpcode() == ISD::FP_EXTEND && + Src.getOperand(0).getValueType() == MVT::f16) { + return Src.getOperand(0); + } + + if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) { + APFloat Val = CFP->getValueAPF(); + bool LosesInfo = true; + Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo); + if (!LosesInfo) + return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16); + } + + return SDValue(); +} + +SDValue SITargetLowering::performFPRoundCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() && + "combine only useful on gfx8"); + + SDValue TruncSrc = N->getOperand(0); + EVT VT = N->getValueType(0); + if (VT != MVT::f16) + return SDValue(); + + if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 || + TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3, + // and expanding it with min/max saves 1 instruction vs. casting to f32 and + // casting back. + + // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) => + // fmin(fmax(a, b), fmax(fmin(a, b), c)) + SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0)); + if (!A) + return SDValue(); + + SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1)); + if (!B) + return SDValue(); + + SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2)); + if (!C) + return SDValue(); + + // This changes signaling nan behavior. If an input is a signaling nan, it + // would have been quieted by the fpext originally. We don't care because + // these are unconstrained ops. If we needed to insert quieting canonicalizes + // we would be worse off than just doing the promotion. 
+ SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B); + SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B); + SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C); + return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1); +} + unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const { @@ -10937,10 +12012,11 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, // Only do this if we are not trying to support denormals. v_mad_f32 does not // support denormals ever. - if (((VT == MVT::f32 && !hasFP32Denormals(DAG.getMachineFunction())) || - (VT == MVT::f16 && !hasFP64FP16Denormals(DAG.getMachineFunction()) && - getSubtarget()->hasMadF16())) && - isOperationLegal(ISD::FMAD, VT)) + if (((VT == MVT::f32 && + denormalModeIsFlushAllF32(DAG.getMachineFunction())) || + (VT == MVT::f16 && Subtarget->hasMadF16() && + denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) && + isOperationLegal(ISD::FMAD, VT)) return ISD::FMAD; const TargetOptions &Options = DAG.getTarget().Options; @@ -11093,7 +12169,6 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, // The actual DAG is noisier than the pseudo code, but only due to // instructions that disassemble values into low and high parts, and // assemble the final result. 
- SDValue Zero = DAG.getConstant(0, SL, MVT::i32); SDValue One = DAG.getConstant(1, SL, MVT::i32); auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS); @@ -11102,8 +12177,8 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo); if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) { - auto AccumLo = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, Zero); - auto AccumHi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, One); + SDValue AccumLo, AccumHi; + std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32); if (!MulLHSUnsigned32) { auto MulLHSHi = @@ -11152,11 +12227,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) return SDValue(); - // add x, zext (setcc) => addcarry x, 0, setcc - // add x, sext (setcc) => subcarry x, 0, setcc + // add x, zext (setcc) => uaddo_carry x, 0, setcc + // add x, sext (setcc) => usubo_carry x, 0, setcc unsigned Opc = LHS.getOpcode(); if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND || - Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY) + Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY) std::swap(RHS, LHS); Opc = RHS.getOpcode(); @@ -11172,15 +12247,15 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, break; SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; - Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY; + Opc = (Opc == ISD::SIGN_EXTEND) ? 
ISD::USUBO_CARRY : ISD::UADDO_CARRY; return DAG.getNode(Opc, SL, VTList, Args); } - case ISD::ADDCARRY: { - // add x, (addcarry y, 0, cc) => addcarry x, y, cc - auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); - if (!C || C->getZExtValue() != 0) break; + case ISD::UADDO_CARRY: { + // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc + if (!isNullConstant(RHS.getOperand(1))) + break; SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) }; - return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args); + return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args); } } return SDValue(); @@ -11198,8 +12273,8 @@ SDValue SITargetLowering::performSubCombine(SDNode *N, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - // sub x, zext (setcc) => subcarry x, 0, setcc - // sub x, sext (setcc) => addcarry x, 0, setcc + // sub x, zext (setcc) => usubo_carry x, 0, setcc + // sub x, sext (setcc) => uaddo_carry x, 0, setcc unsigned Opc = RHS.getOpcode(); switch (Opc) { default: break; @@ -11213,18 +12288,18 @@ SDValue SITargetLowering::performSubCombine(SDNode *N, break; SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1); SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond }; - Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::ADDCARRY : ISD::SUBCARRY; + Opc = (Opc == ISD::SIGN_EXTEND) ? 
ISD::UADDO_CARRY : ISD::USUBO_CARRY; return DAG.getNode(Opc, SL, VTList, Args); } } - if (LHS.getOpcode() == ISD::SUBCARRY) { - // sub (subcarry x, 0, cc), y => subcarry x, y, cc + if (LHS.getOpcode() == ISD::USUBO_CARRY) { + // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); if (!C || !C->isZero()) return SDValue(); SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) }; - return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args); + return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args); } return SDValue(); } @@ -11235,19 +12310,18 @@ SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N, if (N->getValueType(0) != MVT::i32) return SDValue(); - auto C = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (!C || C->getZExtValue() != 0) + if (!isNullConstant(N->getOperand(1))) return SDValue(); SelectionDAG &DAG = DCI.DAG; SDValue LHS = N->getOperand(0); - // addcarry (add x, y), 0, cc => addcarry x, y, cc - // subcarry (sub x, y), 0, cc => subcarry x, y, cc + // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc + // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc unsigned LHSOpc = LHS.getOpcode(); unsigned Opc = N->getOpcode(); - if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) || - (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) { + if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) || + (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) { SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) }; return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args); } @@ -11599,8 +12673,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performAddCombine(N, DCI); case ISD::SUB: return performSubCombine(N, DCI); - case ISD::ADDCARRY: - case ISD::SUBCARRY: + case ISD::UADDO_CARRY: + case ISD::USUBO_CARRY: return performAddCarrySubCarryCombine(N, DCI); case ISD::FADD: return performFAddCombine(N, DCI); @@ -11637,12 +12711,12 @@ SDValue 
SITargetLowering::PerformDAGCombine(SDNode *N, return performFCanonicalizeCombine(N, DCI); case AMDGPUISD::RCP: return performRcpCombine(N, DCI); + case ISD::FLDEXP: case AMDGPUISD::FRACT: case AMDGPUISD::RSQ: case AMDGPUISD::RCP_LEGACY: case AMDGPUISD::RCP_IFLAG: - case AMDGPUISD::RSQ_CLAMP: - case AMDGPUISD::LDEXP: { + case AMDGPUISD::RSQ_CLAMP: { // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted SDValue Src = N->getOperand(0); if (Src.isUndef()) @@ -11652,6 +12726,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return performUCharToFloatCombine(N, DCI); + case ISD::FCOPYSIGN: + return performFCopySignCombine(N, DCI); case AMDGPUISD::CVT_F32_UBYTE0: case AMDGPUISD::CVT_F32_UBYTE1: case AMDGPUISD::CVT_F32_UBYTE2: @@ -11685,6 +12761,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performExtractVectorEltCombine(N, DCI); case ISD::INSERT_VECTOR_ELT: return performInsertVectorEltCombine(N, DCI); + case ISD::FP_ROUND: + return performFPRoundCombine(N, DCI); case ISD::LOAD: { if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI)) return Widended; @@ -11778,7 +12856,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, // Set which texture component corresponds to the lane. unsigned Comp; for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) { - Comp = countTrailingZeros(Dmask); + Comp = llvm::countr_zero(Dmask); Dmask &= ~(1 << Comp); } @@ -12548,6 +13626,15 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); } + // TODO: Move this logic to getReservedRegs() + // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling. + unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); + Register SReg = ST.isWave32() + ? 
AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1) + : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2, + &AMDGPU::SGPR_64RegClass); + Info->setSGPRForEXECCopy(SReg); + assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), Info->getStackPtrOffsetReg())); if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG) @@ -12591,6 +13678,41 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { TargetLoweringBase::finalizeLowering(MF); } +void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op, + KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth) const { + Known.resetAll(); + unsigned Opc = Op.getOpcode(); + switch (Opc) { + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + switch (IID) { + case Intrinsic::amdgcn_mbcnt_lo: + case Intrinsic::amdgcn_mbcnt_hi: { + const GCNSubtarget &ST = + DAG.getMachineFunction().getSubtarget<GCNSubtarget>(); + // These return at most the (wavefront size - 1) + src1 + // As long as src1 is an immediate we can calc known bits + KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1); + unsigned Src1ValBits = Src1Known.countMaxActiveBits(); + unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2()); + // Cater for potential carry + MaxActiveBits += Src1ValBits ? 
1 : 0; + unsigned Size = Op.getValueType().getSizeInBits(); + if (MaxActiveBits < Size) + Known.Zero.setHighBits(Size - MaxActiveBits); + return; + } + } + break; + } + } + return AMDGPUTargetLowering::computeKnownBitsForTargetNode( + Op, Known, DemandedElts, DAG, Depth); +} + void SITargetLowering::computeKnownBitsForFrameIndex( const int FI, KnownBits &Known, const MachineFunction &MF) const { TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF); @@ -12605,7 +13727,7 @@ static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim) { unsigned MaxValue = ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim); - Known.Zero.setHighBits(countLeadingZeros(MaxValue)); + Known.Zero.setHighBits(llvm::countl_zero(MaxValue)); } void SITargetLowering::computeKnownBitsForTargetInstr( @@ -12636,7 +13758,7 @@ void SITargetLowering::computeKnownBitsForTargetInstr( // based on the actual size because we don't know if it's accurate or not // at any given point. Known.Zero.setHighBits( - countLeadingZeros(getSubtarget()->getAddressableLocalMemorySize())); + llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize())); break; } } @@ -12648,6 +13770,30 @@ void SITargetLowering::computeKnownBitsForTargetInstr( case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: Known.Zero.setHighBits(16); break; + case AMDGPU::G_AMDGPU_SMED3: + case AMDGPU::G_AMDGPU_UMED3: { + auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs(); + + KnownBits Known2; + KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1); + if (Known2.isUnknown()) + break; + + KnownBits Known1; + KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1); + if (Known1.isUnknown()) + break; + + KnownBits Known0; + KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1); + if (Known0.isUnknown()) + break; + + // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling. 
+ Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero; + Known.One = Known0.One & Known1.One & Known2.One; + break; + } } } @@ -12759,9 +13905,9 @@ static bool isCopyFromRegOfInlineAsm(const SDNode *N) { return false; } -bool SITargetLowering::isSDNodeSourceOfDivergence( - const SDNode *N, FunctionLoweringInfo *FLI, - LegacyDivergenceAnalysis *KDA) const { +bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, + FunctionLoweringInfo *FLI, + UniformityInfo *UA) const { switch (N->getOpcode()) { case ISD::CopyFromReg: { const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1)); @@ -12774,7 +13920,7 @@ bool SITargetLowering::isSDNodeSourceOfDivergence( return !TRI->isSGPRReg(MRI, Reg); if (const Value *V = FLI->getValueFromVirtualReg(R->getReg())) - return KDA->isDivergent(V); + return UA->isDivergent(V); assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N)); return !TRI->isSGPRReg(MRI, Reg); @@ -12794,8 +13940,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence( return AMDGPU::isIntrinsicSourceOfDivergence( cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()); case AMDGPUISD::ATOMIC_CMP_SWAP: - case AMDGPUISD::ATOMIC_INC: - case AMDGPUISD::ATOMIC_DEC: case AMDGPUISD::ATOMIC_LOAD_FMIN: case AMDGPUISD::ATOMIC_LOAD_FMAX: case AMDGPUISD::BUFFER_ATOMIC_SWAP: @@ -12830,10 +13974,10 @@ bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const { switch (VT.getScalarType().getSimpleVT().SimpleTy) { case MVT::f32: - return hasFP32Denormals(DAG.getMachineFunction()); + return !denormalModeIsFlushAllF32(DAG.getMachineFunction()); case MVT::f64: case MVT::f16: - return hasFP64FP16Denormals(DAG.getMachineFunction()); + return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction()); default: return false; } @@ -12843,10 +13987,10 @@ bool SITargetLowering::denormalsEnabledForType(LLT Ty, MachineFunction &MF) const { switch (Ty.getScalarSizeInBits()) { case 32: - return hasFP32Denormals(MF); + return 
!denormalModeIsFlushAllF32(MF); case 64: case 16: - return hasFP64FP16Denormals(MF); + return !denormalModeIsFlushAllF64F16(MF); default: return false; } @@ -12930,6 +14074,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { if (AMDGPU::isFlatGlobalAddrSpace(AS) && Subtarget->hasAtomicFaddNoRtnInsts()) { + if (Subtarget->hasGFX940Insts()) + return AtomicExpansionKind::None; + if (unsafeFPAtomicsDisabled(RMW->getFunction())) return AtomicExpansionKind::CmpXChg; @@ -13054,6 +14201,8 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { // uniform values (as produced by the mask results of control flow intrinsics) // used outside of divergent blocks. The phi users need to also be treated as // always uniform. +// +// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis? static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited, unsigned WaveSize) { // FIXME: We assume we never cast the mask results of a control flow @@ -13153,6 +14302,11 @@ bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, hasMemSDNodeUser(*N0->use_begin())); } +bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, + Register N0, Register N1) const { + return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks +} + MachineMemOperand::Flags SITargetLowering::getTargetMMOFlags(const Instruction &I) const { // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load. @@ -13196,37 +14350,36 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { assert(AI->getOperation() == AtomicRMWInst::FAdd && "only fadd is supported for now"); - // Given: atomicrmw fadd float* %addr, float %val ordering + // Given: atomicrmw fadd ptr %addr, float %val ordering // // With this expansion we produce the following code: // [...] 
- // %int8ptr = bitcast float* %addr to i8* // br label %atomicrmw.check.shared // // atomicrmw.check.shared: - // %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %int8ptr) + // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr) // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private // // atomicrmw.shared: - // %cast.shared = addrspacecast float* %addr to float addrspace(3)* - // %loaded.shared = atomicrmw fadd float addrspace(3)* %cast.shared, + // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3) + // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared, // float %val ordering // br label %atomicrmw.phi // // atomicrmw.check.private: - // %is.private = call i1 @llvm.amdgcn.is.private(i8* %int8ptr) + // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr) // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global // // atomicrmw.private: - // %cast.private = addrspacecast float* %addr to float addrspace(5)* - // %loaded.private = load float, float addrspace(5)* %cast.private + // %cast.private = addrspacecast ptr %addr to ptr addrspace(5) + // %loaded.private = load float, ptr addrspace(5) %cast.private // %val.new = fadd float %loaded.private, %val - // store float %val.new, float addrspace(5)* %cast.private + // store float %val.new, ptr addrspace(5) %cast.private // br label %atomicrmw.phi // // atomicrmw.global: - // %cast.global = addrspacecast float* %addr to float addrspace(1)* - // %loaded.global = atomicrmw fadd float addrspace(1)* %cast.global, + // %cast.global = addrspacecast ptr %addr to ptr addrspace(1) + // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global, // float %val ordering // br label %atomicrmw.phi // @@ -13259,7 +14412,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { Value *Val = AI->getValOperand(); Type *ValTy = Val->getType(); Value *Addr = AI->getPointerOperand(); - PointerType *PtrTy = cast<PointerType>(Addr->getType()); auto 
CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr, Value *Val) -> Value * { @@ -13275,30 +14427,27 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); - Value *Int8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy()); Builder.CreateBr(CheckSharedBB); Builder.SetInsertPoint(CheckSharedBB); CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {}, - {Int8Ptr}, nullptr, "is.shared"); + {Addr}, nullptr, "is.shared"); Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB); Builder.SetInsertPoint(SharedBB); Value *CastToLocal = Builder.CreateAddrSpaceCast( - Addr, - PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::LOCAL_ADDRESS)); + Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS)); Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val); Builder.CreateBr(PhiBB); Builder.SetInsertPoint(CheckPrivateBB); CallInst *IsPrivate = Builder.CreateIntrinsic( - Intrinsic::amdgcn_is_private, {}, {Int8Ptr}, nullptr, "is.private"); + Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private"); Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB); Builder.SetInsertPoint(PrivateBB); Value *CastToPrivate = Builder.CreateAddrSpaceCast( - Addr, - PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::PRIVATE_ADDRESS)); + Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS)); Value *LoadedPrivate = Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private"); Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new"); @@ -13307,8 +14456,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { Builder.SetInsertPoint(GlobalBB); Value *CastToGlobal = Builder.CreateAddrSpaceCast( - Addr, - PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::GLOBAL_ADDRESS)); + Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS)); Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val); Builder.CreateBr(PhiBB); @@ -13322,3 
+14470,25 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); } + +LoadInst * +SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { + IRBuilder<> Builder(AI); + auto Order = AI->getOrdering(); + + // The optimization removes store aspect of the atomicrmw. Therefore, cache + // must be flushed if the atomic ordering had a release semantics. This is + // not necessary a fence, a release fence just coincides to do that flush. + // Avoid replacing of an atomicrmw with a release semantics. + if (isReleaseOrStronger(Order)) + return nullptr; + + LoadInst *LI = Builder.CreateAlignedLoad( + AI->getType(), AI->getPointerOperand(), AI->getAlign()); + LI->setAtomic(Order, AI->getSyncScopeID()); + LI->copyMetadata(*AI); + LI->takeName(AI); + AI->replaceAllUsesWith(LI); + AI->eraseFromParent(); + return LI; +} diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 3b2c58108667..1745c0b9e88e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -87,8 +87,6 @@ private: SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; - SDValue makeV_ILLEGAL(SDValue Op, SelectionDAG &DAG) const; - // The raw.tbuffer and struct.tbuffer intrinsics have two offset args: offset // (the offset that is included in bounds checking and swizzling, to be split // between the instruction's voffset and immoffset fields) and soffset (the @@ -108,8 +106,10 @@ private: SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFFREXP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue 
lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; @@ -143,6 +143,7 @@ private: /// Custom lowering for ISD::FP_ROUND for MVT::f16. SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; @@ -167,6 +168,8 @@ private: SDValue performUCharToFloatCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFCopySignCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSHLPtrCombine(SDNode *N, unsigned AS, EVT MemVT, @@ -191,12 +194,14 @@ private: SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, SDValue Op0, SDValue Op1) const; SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, - SDValue Op0, SDValue Op1, bool Signed) const; + SDValue Src, SDValue MinVal, SDValue MaxVal, + bool Signed) const; SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, @@ -250,6 +255,17 @@ private: void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, SDValue *Offsets, Align Alignment = Align(4)) const; + // Convert the i128 that an addrspace(8) pointer is natively represented as + // into the v4i32 that all the 
buffer intrinsics expect to receive. We can't + // add register classes for i128 on pain of the promotion logic going haywire, + // so this slightly ugly hack is what we've got. If passed a non-pointer + // argument (as would be seen in older buffer intrinsics), does nothing. + SDValue bufferRsrcPtrToVector(SDValue MaybePointer, SelectionDAG &DAG) const; + + // Wrap a 64-bit pointer into a v4i32 (which is how all SelectionDAG code + // represents ptr addrspace(8)) using the flags specified in the intrinsic. + SDValue lowerPointerAsRsrcIntrin(SDNode *Op, SelectionDAG &DAG) const; + // Handle 8 bit and 16 bit buffer loads SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL, ArrayRef<SDValue> Ops, MemSDNode *M) const; @@ -272,6 +288,12 @@ public: bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override; + // While address space 7 should never make it to codegen, it still needs to + // have a MVT to prevent some analyses that query this function from breaking, + // so, to work around the lack of i160, map it to v5i32. + MVT getPointerTy(const DataLayout &DL, unsigned AS) const override; + MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override; + bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override; @@ -331,6 +353,12 @@ public: bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + unsigned combineRepeatedFPDivisors() const override { + // Combine multiple FDIVs with the same divisor into multiple FMULs by the + // reciprocal. 
+ return 2; + } + bool supportSplitCSR(MachineFunction *MF) const override; void initializeSplitCSR(MachineBasicBlock *Entry) const override; void insertCopiesSplitCSR( @@ -361,7 +389,7 @@ public: SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const; - SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, SelectionDAG &DAG, @@ -396,7 +424,6 @@ public: EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; - bool hasBitPreservingFPLogic(EVT VT) const override; bool hasAtomicFaddRtnForTy(SDValue &Op) const; bool enableAggressiveFMAFusion(EVT VT) const override; bool enableAggressiveFMAFusion(LLT Ty) const override; @@ -452,6 +479,10 @@ public: void finalizeLowering(MachineFunction &MF) const override; + void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override; @@ -464,14 +495,17 @@ public: Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth = 0) const override; - bool isSDNodeSourceOfDivergence(const SDNode *N, - FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override; + bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, + UniformityInfo *UA) const override; bool hasMemSDNodeUser(SDNode *N) const; bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override; + bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, + Register N1) const override; + bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth = 5) const; bool isCanonicalized(Register Reg, MachineFunction &MF, @@ -495,6 +529,9 @@ public: 
shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; void emitExpandAtomicRMW(AtomicRMWInst *AI) const override; + LoadInst * + lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; + const TargetRegisterClass *getRegClassFor(MVT VT, bool isDivergent) const override; bool requiresUniformRegister(MachineFunction &MF, diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 851c407bb255..4b0283b27a6f 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -35,7 +35,7 @@ #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/InitializePasses.h" #include "llvm/Support/DebugCounter.h" -#include "llvm/Support/TargetParser.h" +#include "llvm/TargetParser/TargetParser.h" using namespace llvm; #define DEBUG_TYPE "si-insert-waitcnts" @@ -57,8 +57,6 @@ namespace { // associated with the operand. Used for determining whether // s_waitcnt instruction needs to be emitted. 
-#define CNT_MASK(t) (1u << (t)) - enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS }; } // namespace @@ -88,19 +86,20 @@ struct RegisterEncoding { }; enum WaitEventType { - VMEM_ACCESS, // vector-memory read & write - VMEM_READ_ACCESS, // vector-memory read - VMEM_WRITE_ACCESS, // vector-memory write - LDS_ACCESS, // lds read & write - GDS_ACCESS, // gds read & write - SQ_MESSAGE, // send message - SMEM_ACCESS, // scalar-memory read & write - EXP_GPR_LOCK, // export holding on its data src - GDS_GPR_LOCK, // GDS holding on its data and addr src - EXP_POS_ACCESS, // write to export position - EXP_PARAM_ACCESS, // write to export parameter - VMW_GPR_LOCK, // vector-memory write holding on its data src - EXP_LDS_ACCESS, // read by ldsdir counting as export + VMEM_ACCESS, // vector-memory read & write + VMEM_READ_ACCESS, // vector-memory read + VMEM_WRITE_ACCESS, // vector-memory write that is not scratch + SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch + LDS_ACCESS, // lds read & write + GDS_ACCESS, // gds read & write + SQ_MESSAGE, // send message + SMEM_ACCESS, // scalar-memory read & write + EXP_GPR_LOCK, // export holding on its data src + GDS_GPR_LOCK, // GDS holding on its data and addr src + EXP_POS_ACCESS, // write to export position + EXP_PARAM_ACCESS, // write to export parameter + VMW_GPR_LOCK, // vector-memory write holding on its data src + EXP_LDS_ACCESS, // read by ldsdir counting as export NUM_WAIT_EVENTS, }; @@ -110,7 +109,7 @@ static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = { (1 << SQ_MESSAGE), (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS), - (1 << VMEM_WRITE_ACCESS)}; + (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS)}; // The mapping is: // 0 .. 
SQ_MAX_PGM_VGPRS-1 real VGPRs @@ -372,11 +371,8 @@ private: MachinePostDominatorTree *PDT; struct BlockInfo { - MachineBasicBlock *MBB; std::unique_ptr<WaitcntBrackets> Incoming; bool Dirty = true; - - explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {} }; MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; @@ -386,6 +382,10 @@ private: bool ForceEmitZeroWaitcnts; bool ForceEmitWaitcnt[NUM_INST_CNTS]; + // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS + // message. + DenseSet<MachineInstr *> ReleaseVGPRInsts; + public: static char ID; @@ -398,6 +398,7 @@ public: bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets); bool isPreheaderToFlush(MachineBasicBlock &MBB, WaitcntBrackets &ScoreBrackets); + bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { @@ -418,10 +419,6 @@ public: return false; } - AMDGPU::Waitcnt allZeroWaitcnt() const { - return AMDGPU::Waitcnt::allZero(ST->hasVscnt()); - } - void setForceEmitWaitcnt() { // For non-debug builds, ForceEmitWaitcnt has been initialized to false; // For debug builds, get the debug counter info and adjust if need be @@ -455,13 +452,19 @@ public: assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst)); if (!ST->hasVscnt()) return VMEM_ACCESS; - if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) + if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) { + // FLAT and SCRATCH instructions may access scratch. Other VMEM + // instructions do not. 
+ if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst)) + return SCRATCH_WRITE_ACCESS; return VMEM_WRITE_ACCESS; + } return VMEM_READ_ACCESS; } bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; + bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; bool generateWaitcntInstBefore(MachineInstr &MI, WaitcntBrackets &ScoreBrackets, MachineInstr *OldWaitcntInstr, @@ -1029,7 +1032,18 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, MI.getOpcode() == AMDGPU::SI_RETURN || MI.getOpcode() == AMDGPU::S_SETPC_B64_return || (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { - Wait = Wait.combined(allZeroWaitcnt()); + Wait = Wait.combined(AMDGPU::Waitcnt::allZeroExceptVsCnt()); + } + // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM + // stores. In this case it can be useful to send a message to explicitly + // release all VGPRs before the stores have completed, but it is only safe to + // do this if there are no outstanding scratch stores. + else if (MI.getOpcode() == AMDGPU::S_ENDPGM || + MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { + if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && + ScoreBrackets.getScoreRange(VS_CNT) != 0 && + !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)) + ReleaseVGPRInsts.insert(&MI); } // Resolve vm waits before gs-done. else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || @@ -1214,7 +1228,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here. 
if (MI.getOpcode() == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) { - Wait = Wait.combined(allZeroWaitcnt()); + Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt())); } // TODO: Remove this work-around, enable the assert for Bug 457939 @@ -1230,7 +1244,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, ScoreBrackets.simplifyWaitcnt(Wait); if (ForceEmitZeroWaitcnts) - Wait = allZeroWaitcnt(); + Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(); if (ForceEmitWaitcnt[VM_CNT]) Wait.VmCnt = 0; @@ -1238,8 +1252,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, Wait.ExpCnt = 0; if (ForceEmitWaitcnt[LGKM_CNT]) Wait.LgkmCnt = 0; - if (ForceEmitWaitcnt[VS_CNT]) - Wait.VsCnt = 0; if (FlushVmCnt) { if (ScoreBrackets.hasPendingEvent(VM_CNT)) @@ -1384,6 +1396,32 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { return false; } +// This is a flat memory operation. Check to see if it has memory tokens for +// either scratch or FLAT. +bool SIInsertWaitcnts::mayAccessScratchThroughFlat( + const MachineInstr &MI) const { + assert(TII->isFLAT(MI)); + + // SCRATCH instructions always access scratch. + if (TII->isFLATScratch(MI)) + return true; + + // GLOBAL instructions never access scratch. + if (TII->isFLATGlobal(MI)) + return false; + + // If there are no memory operands then conservatively assume the flat + // operation may access scratch. + if (MI.memoperands_empty()) + return true; + + // See if any memory operand specifies an address space that involves scratch. + return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) { + unsigned AS = Memop->getAddrSpace(); + return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; + }); +} + void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, WaitcntBrackets *ScoreBrackets) { // Now look at the instruction opcode. 
If it is a memory access @@ -1436,7 +1474,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, } else if (Inst.isCall()) { if (callWaitsOnFunctionReturn(Inst)) { // Act as a wait on everything - ScoreBrackets->applyWaitcnt(allZeroWaitcnt()); + ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt()); } else { // May need to way wait for anything. ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); @@ -1703,6 +1741,11 @@ bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB, return UpdateCache(false); } +bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { + return SIInstrInfo::isVMEM(MI) || + (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI)); +} + // Return true if it is better to flush the vmcnt counter in the preheader of // the given loop. We currently decide to flush in two situations: // 1. The loop contains vmem store(s), no vmem load and at least one use of a @@ -1721,7 +1764,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, for (MachineBasicBlock *MBB : ML->blocks()) { for (MachineInstr &MI : *MBB) { - if (SIInstrInfo::isVMEM(MI)) { + if (isVMEMOrFlatVMEM(MI)) { if (MI.mayLoad()) HasVMemLoad = true; if (MI.mayStore()) @@ -1749,7 +1792,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, } } // VMem load vgpr def - else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef()) + else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef()) for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { // If we find a register that is loaded inside the loop, 1. and 2. // are invalidated and we can exit. 
@@ -1813,10 +1856,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { I != E && (I->isPHI() || I->isMetaInstruction()); ++I) ; BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); - if (ST->hasVscnt()) - BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(0); Modified = true; } @@ -1824,7 +1863,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { // Keep iterating over the blocks in reverse post order, inserting and // updating s_waitcnt where needed, until a fix point is reached. for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF)) - BlockInfos.insert({MBB, BlockInfo(MBB)}); + BlockInfos.insert({MBB, BlockInfo()}); std::unique_ptr<WaitcntBrackets> Brackets; bool Repeat; @@ -1833,6 +1872,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE; ++BII) { + MachineBasicBlock *MBB = BII->first; BlockInfo &BI = BII->second; if (!BI.Dirty) continue; @@ -1849,12 +1889,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { *Brackets = WaitcntBrackets(ST, Limits, Encoding); } - Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets); + Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets); BI.Dirty = false; if (Brackets->hasPendingEvent()) { BlockInfo *MoveBracketsToSucc = nullptr; - for (MachineBasicBlock *Succ : BI.MBB->successors()) { + for (MachineBasicBlock *Succ : MBB->successors()) { auto SuccBII = BlockInfos.find(Succ); BlockInfo &SuccBI = SuccBII->second; if (!SuccBI.Incoming) { @@ -1924,5 +1964,18 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } } + // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM + // instructions. 
+ for (MachineInstr *MI : ReleaseVGPRInsts) { + if (ST->requiresNopBeforeDeallocVGPRs()) { + BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_NOP)) + .addImm(0); + } + BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG)) + .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus); + Modified = true; + } + ReleaseVGPRInsts.clear(); + return Modified; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index d86d4e659803..f674777724eb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -153,6 +153,9 @@ class InstSI <dag outs, dag ins, string asm = "", // This bit indicates that tied source will not be read. field bit TiedSourceNotRead = 0; + // This bit indicates that the instruction is never-uniform/divergent + field bit IsNeverUniform = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -234,6 +237,8 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{60} = TiedSourceNotRead; + let TSFlags{61} = IsNeverUniform; + let SchedRW = [Write32Bit]; let AsmVariantName = AMDGPUAsmVariants.Default; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 2cbc90219334..278cf2b69ee3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -329,8 +329,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth( const MachineOperand *Offset1Op = getNamedOperand(LdSt, AMDGPU::OpName::offset1); - unsigned Offset0 = Offset0Op->getImm(); - unsigned Offset1 = Offset1Op->getImm(); + unsigned Offset0 = Offset0Op->getImm() & 0xff; + unsigned Offset1 = Offset1Op->getImm() & 0xff; if (Offset0 + 1 != Offset1) return false; @@ -537,7 +537,7 @@ static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool 
KillSrc, - const char *Msg = "illegal SGPR to VGPR copy") { + const char *Msg = "illegal VGPR to SGPR copy") { MachineFunction *MF = MBB.getParent(); DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error); LLVMContext &C = MF->getFunction().getContext(); @@ -578,9 +578,12 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, if (!RegsOverlap) { for (auto Def = MI, E = MBB.begin(); Def != E; ) { --Def; - if (!Def->definesRegister(SrcReg, &RI)) + + if (!Def->modifiesRegister(SrcReg, &RI)) continue; - if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) + + if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 || + Def->getOperand(0).getReg() != SrcReg) break; MachineOperand &DefOp = Def->getOperand(1); @@ -615,8 +618,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, } } - RS.enterBasicBlock(MBB); - RS.forward(MI); + RS.enterBasicBlockEnd(MBB); + RS.backward(MI); // Ideally we want to have three registers for a long reg_sequence copy // to hide 2 waitstates between v_mov_b32 and accvgpr_write. @@ -631,11 +634,12 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, assert(MBB.getParent()->getRegInfo().isReserved(Tmp) && "VGPR used for an intermediate copy should have been reserved."); - // Only loop through if there are any free registers left, otherwise - // scavenger may report a fatal error without emergency spill slot - // or spill with the slot. - while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { - Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + // Only loop through if there are any free registers left. We don't want to + // spill. 
+ while (RegNo--) { + Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, + /* RestoreAfter */ false, 0, + /* AllowSpill */ false); if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) break; Tmp = Tmp2; @@ -1394,6 +1398,14 @@ static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) { return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5; if (VecSize <= 256) // 32 bytes return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8; + if (VecSize <= 288) // 36 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9; + if (VecSize <= 320) // 40 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10; + if (VecSize <= 352) // 44 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11; + if (VecSize <= 384) // 48 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12; if (VecSize <= 512) // 64 bytes return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16; if (VecSize <= 1024) // 128 bytes @@ -1575,6 +1587,30 @@ static unsigned getAVSpillSaveOpcode(unsigned Size) { } } +static unsigned getWWMRegSpillSaveOpcode(unsigned Size) { + // Currently, there is only 32-bit WWM register spills needed. + if (Size != 4) + llvm_unreachable("unknown wwm register spill size"); + + return AMDGPU::SI_SPILL_WWM_V32_SAVE; +} + +static unsigned getVectorRegSpillSaveOpcode(Register Reg, + const TargetRegisterClass *RC, + unsigned Size, + const SIRegisterInfo &TRI, + const SIMachineFunctionInfo &MFI) { + // Choose the right opcode if spilling a WWM register. + if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) + return getWWMRegSpillSaveOpcode(Size); + + if (TRI.isVectorSuperClass(RC)) + return getAVSpillSaveOpcode(Size); + + return TRI.isAGPRClass(RC) ? 
getAGPRSpillSaveOpcode(Size) + : getVGPRSpillSaveOpcode(Size); +} + void SIInstrInfo::storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, @@ -1619,11 +1655,8 @@ void SIInstrInfo::storeRegToStackSlot( return; } - unsigned Opcode = RI.isVectorSuperClass(RC) - ? getAVSpillSaveOpcode(SpillSize) - : RI.isAGPRClass(RC) - ? getAGPRSpillSaveOpcode(SpillSize) - : getVGPRSpillSaveOpcode(SpillSize); + unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, + SpillSize, RI, *MFI); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) @@ -1774,6 +1807,29 @@ static unsigned getAVSpillRestoreOpcode(unsigned Size) { } } +static unsigned getWWMRegSpillRestoreOpcode(unsigned Size) { + // Currently, there is only 32-bit WWM register spills needed. + if (Size != 4) + llvm_unreachable("unknown wwm register spill size"); + + return AMDGPU::SI_SPILL_WWM_V32_RESTORE; +} + +static unsigned +getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, + unsigned Size, const SIRegisterInfo &TRI, + const SIMachineFunctionInfo &MFI) { + // Choose the right opcode if restoring a WWM register. + if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) + return getWWMRegSpillRestoreOpcode(Size); + + if (TRI.isVectorSuperClass(RC)) + return getAVSpillRestoreOpcode(Size); + + return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size) + : getVGPRSpillRestoreOpcode(Size); +} + void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, @@ -1817,11 +1873,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, return; } - unsigned Opcode = RI.isVectorSuperClass(RC) - ? getAVSpillRestoreOpcode(SpillSize) - : RI.isAGPRClass(RC) - ? getAGPRSpillRestoreOpcode(SpillSize) - : getVGPRSpillRestoreOpcode(SpillSize); + unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? 
VReg : DestReg, RC, + SpillSize, RI, *MFI); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset @@ -1941,6 +1994,18 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(AMDGPU::S_AND_B32)); break; + case AMDGPU::S_AND_SAVEEXEC_B64_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64)); + break; + + case AMDGPU::S_AND_SAVEEXEC_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32)); + break; + case AMDGPU::V_MOV_B64_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -2084,6 +2149,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8: + case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9: + case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10: + case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11: + case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32: case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1: @@ -2345,6 +2414,14 @@ SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { return std::pair(Split[0], Split[1]); } +std::optional<DestSourcePair> +SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { + if (MI.getOpcode() == AMDGPU::WWM_COPY) + return DestSourcePair{MI.getOperand(0), MI.getOperand(1)}; + + return std::nullopt; +} + bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, @@ -2522,6 +2599,7 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, MachineFunction *MF = MBB.getParent(); 
MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); // FIXME: Virtual register workaround for RegScavenger not working with empty // blocks. @@ -2555,12 +2633,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) .addReg(PCReg); - // FIXME: If spilling is necessary, this will fail because this scavenger has - // no emergency stack slots. It is non-trivial to spill in this situation, - // because the restore code needs to be specially placed after the - // jump. BranchRelaxation then needs to be made aware of the newly inserted - // block. - // // If a spill is needed for the pc register pair, we need to insert a spill // restore block right before the destination block, and insert a short branch // into the old destination block's fallthrough predecessor. @@ -2591,10 +2663,20 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, // dest_bb: // buzz; - RS->enterBasicBlockEnd(MBB); - Register Scav = RS->scavengeRegisterBackwards( - AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC), - /* RestoreAfter */ false, 0, /* AllowSpill */ false); + Register LongBranchReservedReg = MFI->getLongBranchReservedReg(); + Register Scav; + + // If we've previously reserved a register for long branches + // avoid running the scavenger and just use those registers + if (LongBranchReservedReg) { + RS->enterBasicBlock(MBB); + Scav = LongBranchReservedReg; + } else { + RS->enterBasicBlockEnd(MBB); + Scav = RS->scavengeRegisterBackwards( + AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC), + /* RestoreAfter */ false, 0, /* AllowSpill */ false); + } if (Scav) { RS->setRegUsed(Scav); MRI.replaceRegWith(PCReg, Scav); @@ -2720,11 +2802,13 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, case AMDGPU::S_OR_B64_term: case AMDGPU::S_ANDN2_B64_term: case AMDGPU::S_AND_B64_term: + case 
AMDGPU::S_AND_SAVEEXEC_B64_term: case AMDGPU::S_MOV_B32_term: case AMDGPU::S_XOR_B32_term: case AMDGPU::S_OR_B32_term: case AMDGPU::S_ANDN2_B32_term: case AMDGPU::S_AND_B32_term: + case AMDGPU::S_AND_SAVEEXEC_B32_term: break; case AMDGPU::SI_IF: case AMDGPU::SI_ELSE: @@ -2858,7 +2942,7 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, if (MRI.getRegClass(FalseReg) != RC) return false; - int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; + int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32; CondCycles = TrueCycles = FalseCycles = NumInsts; // ??? // Limit to equal cost for branch vs. N v_cndmask_b32s. @@ -2873,7 +2957,7 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, if (MRI.getRegClass(FalseReg) != RC) return false; - int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32; + int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32; // Multiples of 8 can do s_cselect_b64 if (NumInsts % 2 == 0) @@ -3004,6 +3088,7 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B64: case AMDGPU::COPY: + case AMDGPU::WWM_COPY: case AMDGPU::V_ACCVGPR_WRITE_B32_e64: case AMDGPU::V_ACCVGPR_READ_B32_e64: case AMDGPU::V_ACCVGPR_MOV_B32: @@ -3084,7 +3169,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, assert(UseMI.getOperand(1).getReg().isVirtual()); } - UseMI.setDesc(get(NewOpc)); + const MCInstrDesc &NewMCID = get(NewOpc); + if (DstReg.isPhysical() && + !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg)) + return false; + + UseMI.setDesc(NewMCID); UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); return true; @@ -4352,7 +4442,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, // Adjust for packed 16 bit values if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) - RegCount >>= 1; + RegCount = divideCeil(RegCount, 2); // Adjust if using LWE or TFE if ((LWE && 
LWE->getImm()) || (TFE && TFE->getImm())) @@ -4365,7 +4455,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; if (RegCount > DstSize) { - ErrInfo = "MIMG instruction returns too many registers for dst " + ErrInfo = "Image instruction returns too many registers for dst " "register class"; return false; } @@ -4636,9 +4726,12 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, unsigned VAddrWords; if (IsNSA) { VAddrWords = SRsrcIdx - VAddr0Idx; + if (ST.hasPartialNSAEncoding() && AddrWords > ST.getNSAMaxSize()) { + unsigned LastVAddrIdx = SRsrcIdx - 1; + VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1; + } } else { - const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx); - VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32; + VAddrWords = getOpSize(MI, VAddr0Idx) / 4; if (AddrWords > 12) AddrWords = 16; } @@ -4881,6 +4974,51 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { "Unexpected scalar opcode without corresponding vector one!"); } +void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register Reg, + bool IsSCCLive, + SlotIndexes *Indexes) const { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + bool IsWave32 = ST.isWave32(); + if (IsSCCLive) { + // Insert two move instructions, one to save the original value of EXEC and + // the other to turn on all bits in EXEC. This is required as we can't use + // the single instruction S_OR_SAVEEXEC that clobbers SCC. + unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + MCRegister Exec = IsWave32 ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; + auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg) + .addReg(Exec, RegState::Kill); + auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); + if (Indexes) { + Indexes->insertMachineInstrInMaps(*StoreExecMI); + Indexes->insertMachineInstrInMaps(*FlipExecMI); + } + } else { + const unsigned OrSaveExec = + IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; + auto SaveExec = + BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1); + SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. + if (Indexes) + Indexes->insertMachineInstrInMaps(*SaveExec); + } +} + +void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register Reg, + SlotIndexes *Indexes) const { + unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + auto ExecRestoreMI = + BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill); + if (Indexes) + Indexes->insertMachineInstrInMaps(*ExecRestoreMI); +} + static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, @@ -4979,12 +5117,6 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { Opcode = (Size == 64) ? 
AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); - const TargetRegisterClass *VRC64 = RI.getVGPR64Class(); - if (RI.getCommonSubClass(VRC64, VRC)) - VRC = VRC64; - else - VRC = &AMDGPU::VGPR_32RegClass; - Register Reg = MRI.createVirtualRegister(VRC); DebugLoc DL = MBB->findDebugLoc(I); BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO); @@ -5585,13 +5717,12 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, } // Emit the actual waterfall loop, executing the wrapped instruction for each -// unique value of \p Rsrc across all lanes. In the best case we execute 1 +// unique value of \p ScalarOps across all lanes. In the best case we execute 1 // iteration, in the worst case we execute 64 (once per lane). -static void -emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, - MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, - MachineBasicBlock &BodyBB, const DebugLoc &DL, - MachineOperand &Rsrc) { +static void emitLoadScalarOpsFromVGPRLoop( + const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, + MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, + ArrayRef<MachineOperand *> ScalarOps) { MachineFunction &MF = *OrigBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -5609,72 +5740,105 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, SmallVector<Register, 8> ReadlanePieces; Register CondReg; - Register VRsrc = Rsrc.getReg(); - unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); + for (MachineOperand *ScalarOp : ScalarOps) { + unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI); + unsigned NumSubRegs = RegSize / 32; + Register VScalarOp = ScalarOp->getReg(); + + if (NumSubRegs == 1) { + Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + + BuildMI(LoopBB, I, DL, 
TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg) + .addReg(VScalarOp); + + Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC); + + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg) + .addReg(CurReg) + .addReg(VScalarOp); - unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI); - unsigned NumSubRegs = RegSize / 32; - assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size"); + // Combine the comparison results with AND. + if (!CondReg) // First. + CondReg = NewCondReg; + else { // If not the first, we create an AND. + Register AndReg = MRI.createVirtualRegister(BoolXExecRC); + BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) + .addReg(CondReg) + .addReg(NewCondReg); + CondReg = AndReg; + } - for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) { + // Update ScalarOp operand to use the SGPR ScalarOp. + ScalarOp->setReg(CurReg); + ScalarOp->setIsKill(); + } else { + unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef()); + assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && + "Unhandled register size"); - Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) { + Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - // Read the next variant <- also loop target. - BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo) - .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx)); + // Read the next variant <- also loop target. + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo) + .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx)); - // Read the next variant <- also loop target. 
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi) - .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1)); + // Read the next variant <- also loop target. + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi) + .addReg(VScalarOp, VScalarOpUndef, + TRI->getSubRegFromChannel(Idx + 1)); - ReadlanePieces.push_back(CurRegLo); - ReadlanePieces.push_back(CurRegHi); + ReadlanePieces.push_back(CurRegLo); + ReadlanePieces.push_back(CurRegHi); - // Comparison is to be done as 64-bit. - Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); - BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg) + // Comparison is to be done as 64-bit. + Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg) .addReg(CurRegLo) .addImm(AMDGPU::sub0) .addReg(CurRegHi) .addImm(AMDGPU::sub1); - Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC); - auto Cmp = - BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg) - .addReg(CurReg); - if (NumSubRegs <= 2) - Cmp.addReg(VRsrc); - else - Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2)); + Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC); + auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), + NewCondReg) + .addReg(CurReg); + if (NumSubRegs <= 2) + Cmp.addReg(VScalarOp); + else + Cmp.addReg(VScalarOp, VScalarOpUndef, + TRI->getSubRegFromChannel(Idx, 2)); - // Combine the comparison results with AND. - if (!CondReg) // First. - CondReg = NewCondReg; - else { // If not the first, we create an AND. - Register AndReg = MRI.createVirtualRegister(BoolXExecRC); - BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) + // Combine the comparison results with AND. + if (!CondReg) // First. + CondReg = NewCondReg; + else { // If not the first, we create an AND. 
+ Register AndReg = MRI.createVirtualRegister(BoolXExecRC); + BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg) .addReg(CondReg) .addReg(NewCondReg); - CondReg = AndReg; - } - } // End for loop. + CondReg = AndReg; + } + } // End for loop. - auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc)); - Register SRsrc = MRI.createVirtualRegister(SRsrcRC); + auto SScalarOpRC = + TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp)); + Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC); - // Build scalar Rsrc. - auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc); - unsigned Channel = 0; - for (Register Piece : ReadlanePieces) { - Merge.addReg(Piece) - .addImm(TRI->getSubRegFromChannel(Channel++)); - } + // Build scalar ScalarOp. + auto Merge = + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp); + unsigned Channel = 0; + for (Register Piece : ReadlanePieces) { + Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++)); + } - // Update Rsrc operand to use the SGPR Rsrc. - Rsrc.setReg(SRsrc); - Rsrc.setIsKill(); + // Update ScalarOp operand to use the SGPR ScalarOp. + ScalarOp->setReg(SScalarOp); + ScalarOp->setIsKill(); + } + } Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); MRI.setSimpleHint(SaveExec, CondReg); @@ -5694,14 +5858,15 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB); } -// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register +// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register // with SGPRs by iterating over all unique values across all lanes. // Returns the loop basic block that now contains \p MI. 
static MachineBasicBlock * -loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, - MachineOperand &Rsrc, MachineDominatorTree *MDT, - MachineBasicBlock::iterator Begin = nullptr, - MachineBasicBlock::iterator End = nullptr) { +loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, + ArrayRef<MachineOperand *> ScalarOps, + MachineDominatorTree *MDT, + MachineBasicBlock::iterator Begin = nullptr, + MachineBasicBlock::iterator End = nullptr) { MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -5728,11 +5893,8 @@ loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, MachineBasicBlock::iterator AfterMI = MI; ++AfterMI; for (auto I = Begin; I != AfterMI; I++) { - for (auto &MO : I->uses()) { - if (MO.isReg() && MO.isUse()) { - MRI.clearKillFlags(MO.getReg()); - } - } + for (auto &MO : I->all_uses()) + MRI.clearKillFlags(MO.getReg()); } // To insert the loop we need to split the block. 
Move everything after this @@ -5774,7 +5936,7 @@ loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, } } - emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, Rsrc); + emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps); // Restore the EXEC mask MachineBasicBlock::iterator First = RemainderBB->begin(); @@ -5971,11 +6133,11 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, (isMUBUF(MI) || isMTBUF(MI)))) { MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) - CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT); + CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT); MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp); if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) - CreatedBB = loadSRsrcFromVGPR(*this, MI, *SSamp, MDT); + CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT); return CreatedBB; } @@ -6003,25 +6165,39 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() && MI.definesRegister(End->getOperand(1).getReg())) ++End; - CreatedBB = loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End); + CreatedBB = + loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End); + } + } + + // Legalize MUBUF instructions. + bool isSoffsetLegal = true; + int SoffsetIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset); + if (SoffsetIdx != -1) { + MachineOperand *Soffset = &MI.getOperand(SoffsetIdx); + if (Soffset->isReg() && + !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) { + isSoffsetLegal = false; } } - // Legalize MUBUF* instructions. 
+ bool isRsrcLegal = true; int RsrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); if (RsrcIdx != -1) { - // We have an MUBUF instruction MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); - unsigned RsrcRC = get(MI.getOpcode()).operands()[RsrcIdx].RegClass; - if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()), - RI.getRegClass(RsrcRC))) { - // The operands are legal. - // FIXME: We may need to legalize operands besides srsrc. - return CreatedBB; + if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) { + isRsrcLegal = false; } + } - // Legalize a VGPR Rsrc. + // The operands are legal. + if (isRsrcLegal && isSoffsetLegal) + return CreatedBB; + + if (!isRsrcLegal) { + // Legalize a VGPR Rsrc // // If the instruction is _ADDR64, we can avoid a waterfall by extracting // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using @@ -6034,6 +6210,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, // Otherwise we are on non-ADDR64 hardware, and/or we have // idxen/offen/bothen and we fall back to a waterfall loop. + MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); MachineBasicBlock &MBB = *MI.getParent(); MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); @@ -6143,433 +6320,447 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, .addReg(RsrcPtr, 0, AMDGPU::sub1) .addImm(AMDGPU::sub1); } else { - // This is another variant; legalize Rsrc with waterfall loop from VGPRs - // to SGPRs. - CreatedBB = loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT); + // Legalize a VGPR Rsrc and soffset together. + if (!isSoffsetLegal) { + MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset); + CreatedBB = + loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT); + return CreatedBB; + } + CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT); return CreatedBB; } } + + // Legalize a VGPR soffset. 
+ if (!isSoffsetLegal) { + MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset); + CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT); + return CreatedBB; + } return CreatedBB; } -MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst, - MachineDominatorTree *MDT) const { - SetVectorType Worklist; - Worklist.insert(&TopInst); - MachineBasicBlock *CreatedBB = nullptr; - MachineBasicBlock *CreatedBBTmp = nullptr; - - while (!Worklist.empty()) { - MachineInstr &Inst = *Worklist.pop_back_val(); - MachineBasicBlock *MBB = Inst.getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - - unsigned Opcode = Inst.getOpcode(); - unsigned NewOpcode = getVALUOp(Inst); - - // Handle some special cases - switch (Opcode) { - default: - break; - case AMDGPU::S_ADD_U64_PSEUDO: - case AMDGPU::S_SUB_U64_PSEUDO: - splitScalar64BitAddSub(Worklist, Inst, MDT); - Inst.eraseFromParent(); - continue; - case AMDGPU::S_ADD_I32: - case AMDGPU::S_SUB_I32: { - // FIXME: The u32 versions currently selected use the carry. - bool Changed; - std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT); - if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) - CreatedBB = CreatedBBTmp; - if (Changed) - continue; - - // Default handling - break; - } - case AMDGPU::S_AND_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); - Inst.eraseFromParent(); - continue; +void SIInstrWorklist::insert(MachineInstr *MI) { + InstrList.insert(MI); + // Add MBUF instructiosn to deferred list. 
+ int RsrcIdx = + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc); + if (RsrcIdx != -1) { + DeferredList.insert(MI); + } +} - case AMDGPU::S_OR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); - Inst.eraseFromParent(); - continue; +bool SIInstrWorklist::isDeferred(MachineInstr *MI) { + return DeferredList.contains(MI); +} - case AMDGPU::S_XOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); - Inst.eraseFromParent(); - continue; +void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist, + MachineDominatorTree *MDT) const { - case AMDGPU::S_NAND_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); - Inst.eraseFromParent(); + while (!Worklist.empty()) { + MachineInstr &Inst = *Worklist.top(); + Worklist.erase_top(); + // Skip MachineInstr in the deferred list. + if (Worklist.isDeferred(&Inst)) continue; + moveToVALUImpl(Worklist, MDT, Inst); + } - case AMDGPU::S_NOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); - Inst.eraseFromParent(); - continue; + // Deferred list of instructions will be processed once + // all the MachineInstr in the worklist are done. 
+ for (MachineInstr *Inst : Worklist.getDeferredList()) { + moveToVALUImpl(Worklist, MDT, *Inst); + assert(Worklist.empty() && + "Deferred MachineInstr are not supposed to re-populate worklist"); + } +} - case AMDGPU::S_XNOR_B64: - if (ST.hasDLInsts()) - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); - else - splitScalar64BitXnor(Worklist, Inst, MDT); - Inst.eraseFromParent(); - continue; +void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, + MachineDominatorTree *MDT, + MachineInstr &Inst) const { - case AMDGPU::S_ANDN2_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); - Inst.eraseFromParent(); - continue; + MachineBasicBlock *MBB = Inst.getParent(); + if (!MBB) + return; + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Opcode = Inst.getOpcode(); + unsigned NewOpcode = getVALUOp(Inst); + // Handle some special cases + switch (Opcode) { + default: + break; + case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::S_SUB_U64_PSEUDO: + splitScalar64BitAddSub(Worklist, Inst, MDT); + Inst.eraseFromParent(); + return; + case AMDGPU::S_ADD_I32: + case AMDGPU::S_SUB_I32: { + // FIXME: The u32 versions currently selected use the carry. 
+ bool Changed; + MachineBasicBlock *CreatedBBTmp = nullptr; + std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT); + if (Changed) + return; - case AMDGPU::S_ORN2_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); - Inst.eraseFromParent(); - continue; + // Default handling + break; + } + case AMDGPU::S_AND_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_BREV_B64: - splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_OR_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_NOT_B64: - splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_XOR_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_BCNT1_I32_B64: - splitScalar64BitBCNT(Worklist, Inst); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_NAND_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_BFE_I64: - splitScalar64BitBFE(Worklist, Inst); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_NOR_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_LSHL_B32: - if (ST.hasOnlyRevVALUShifts()) { - NewOpcode = AMDGPU::V_LSHLREV_B32_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_ASHR_I32: - if (ST.hasOnlyRevVALUShifts()) { - NewOpcode = AMDGPU::V_ASHRREV_I32_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_LSHR_B32: - if (ST.hasOnlyRevVALUShifts()) { - NewOpcode = AMDGPU::V_LSHRREV_B32_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_LSHL_B64: - if (ST.hasOnlyRevVALUShifts()) { - NewOpcode = AMDGPU::V_LSHLREV_B64_e64; - swapOperands(Inst); - } - 
break; - case AMDGPU::S_ASHR_I64: - if (ST.hasOnlyRevVALUShifts()) { - NewOpcode = AMDGPU::V_ASHRREV_I64_e64; - swapOperands(Inst); - } - break; - case AMDGPU::S_LSHR_B64: - if (ST.hasOnlyRevVALUShifts()) { - NewOpcode = AMDGPU::V_LSHRREV_B64_e64; - swapOperands(Inst); - } - break; + case AMDGPU::S_XNOR_B64: + if (ST.hasDLInsts()) + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); + else + splitScalar64BitXnor(Worklist, Inst, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_ABS_I32: - lowerScalarAbs(Worklist, Inst); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_ANDN2_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_CBRANCH_SCC0: - case AMDGPU::S_CBRANCH_SCC1: { - // Clear unused bits of vcc - Register CondReg = Inst.getOperand(1).getReg(); - bool IsSCC = CondReg == AMDGPU::SCC; - Register VCC = RI.getVCC(); - Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC) - .addReg(EXEC) - .addReg(IsSCC ? 
VCC : CondReg); - Inst.removeOperand(1); - } - break; + case AMDGPU::S_ORN2_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); + Inst.eraseFromParent(); + return; - case AMDGPU::S_BFE_U64: - case AMDGPU::S_BFM_B64: - llvm_unreachable("Moving this op to VALU not implemented"); + case AMDGPU::S_BREV_B64: + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true); + Inst.eraseFromParent(); + return; - case AMDGPU::S_PACK_LL_B32_B16: - case AMDGPU::S_PACK_LH_B32_B16: - case AMDGPU::S_PACK_HL_B32_B16: - case AMDGPU::S_PACK_HH_B32_B16: - movePackToVALU(Worklist, MRI, Inst); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_NOT_B64: + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); + Inst.eraseFromParent(); + return; - case AMDGPU::S_XNOR_B32: - lowerScalarXnor(Worklist, Inst); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_BCNT1_I32_B64: + splitScalar64BitBCNT(Worklist, Inst); + Inst.eraseFromParent(); + return; - case AMDGPU::S_NAND_B32: - splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_BFE_I64: + splitScalar64BitBFE(Worklist, Inst); + Inst.eraseFromParent(); + return; - case AMDGPU::S_NOR_B32: - splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_LSHL_B32: + if (ST.hasOnlyRevVALUShifts()) { + NewOpcode = AMDGPU::V_LSHLREV_B32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_ASHR_I32: + if (ST.hasOnlyRevVALUShifts()) { + NewOpcode = AMDGPU::V_ASHRREV_I32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHR_B32: + if (ST.hasOnlyRevVALUShifts()) { + NewOpcode = AMDGPU::V_LSHRREV_B32_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_LSHL_B64: + if (ST.hasOnlyRevVALUShifts()) { + NewOpcode = AMDGPU::V_LSHLREV_B64_e64; + swapOperands(Inst); + } + break; + case AMDGPU::S_ASHR_I64: + if (ST.hasOnlyRevVALUShifts()) { + NewOpcode = AMDGPU::V_ASHRREV_I64_e64; + swapOperands(Inst); 
+ } + break; + case AMDGPU::S_LSHR_B64: + if (ST.hasOnlyRevVALUShifts()) { + NewOpcode = AMDGPU::V_LSHRREV_B64_e64; + swapOperands(Inst); + } + break; - case AMDGPU::S_ANDN2_B32: - splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_ABS_I32: + lowerScalarAbs(Worklist, Inst); + Inst.eraseFromParent(); + return; - case AMDGPU::S_ORN2_B32: - splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); - Inst.eraseFromParent(); - continue; + case AMDGPU::S_CBRANCH_SCC0: + case AMDGPU::S_CBRANCH_SCC1: { + // Clear unused bits of vcc + Register CondReg = Inst.getOperand(1).getReg(); + bool IsSCC = CondReg == AMDGPU::SCC; + Register VCC = RI.getVCC(); + Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC) + .addReg(EXEC) + .addReg(IsSCC ? VCC : CondReg); + Inst.removeOperand(1); + } break; - // TODO: remove as soon as everything is ready - // to replace VGPR to SGPR copy with V_READFIRSTLANEs. - // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO - // can only be selected from the uniform SDNode. - case AMDGPU::S_ADD_CO_PSEUDO: - case AMDGPU::S_SUB_CO_PSEUDO: { - unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) - ? 
AMDGPU::V_ADDC_U32_e64 - : AMDGPU::V_SUBB_U32_e64; - const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + case AMDGPU::S_BFE_U64: + case AMDGPU::S_BFM_B64: + llvm_unreachable("Moving this op to VALU not implemented"); - Register CarryInReg = Inst.getOperand(4).getReg(); - if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { - Register NewCarryReg = MRI.createVirtualRegister(CarryRC); - BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) - .addReg(CarryInReg); - } + case AMDGPU::S_PACK_LL_B32_B16: + case AMDGPU::S_PACK_LH_B32_B16: + case AMDGPU::S_PACK_HL_B32_B16: + case AMDGPU::S_PACK_HH_B32_B16: + movePackToVALU(Worklist, MRI, Inst); + Inst.eraseFromParent(); + return; - Register CarryOutReg = Inst.getOperand(1).getReg(); + case AMDGPU::S_XNOR_B32: + lowerScalarXnor(Worklist, Inst); + Inst.eraseFromParent(); + return; - Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( - MRI.getRegClass(Inst.getOperand(0).getReg()))); - MachineInstr *CarryOp = - BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) - .addReg(CarryOutReg, RegState::Define) - .add(Inst.getOperand(2)) - .add(Inst.getOperand(3)) - .addReg(CarryInReg) - .addImm(0); - CreatedBBTmp = legalizeOperands(*CarryOp); - if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) - CreatedBB = CreatedBBTmp; - MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); - addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); - Inst.eraseFromParent(); - } - continue; - case AMDGPU::S_UADDO_PSEUDO: - case AMDGPU::S_USUBO_PSEUDO: { - const DebugLoc &DL = Inst.getDebugLoc(); - MachineOperand &Dest0 = Inst.getOperand(0); - MachineOperand &Dest1 = Inst.getOperand(1); - MachineOperand &Src0 = Inst.getOperand(2); - MachineOperand &Src1 = Inst.getOperand(3); + case AMDGPU::S_NAND_B32: + splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); + Inst.eraseFromParent(); + return; - unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) - ? 
AMDGPU::V_ADD_CO_U32_e64 - : AMDGPU::V_SUB_CO_U32_e64; - const TargetRegisterClass *NewRC = - RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); - Register DestReg = MRI.createVirtualRegister(NewRC); - MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) - .addReg(Dest1.getReg(), RegState::Define) - .add(Src0) - .add(Src1) - .addImm(0); // clamp bit + case AMDGPU::S_NOR_B32: + splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); + Inst.eraseFromParent(); + return; - CreatedBBTmp = legalizeOperands(*NewInstr, MDT); - if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) - CreatedBB = CreatedBBTmp; + case AMDGPU::S_ANDN2_B32: + splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); + Inst.eraseFromParent(); + return; - MRI.replaceRegWith(Dest0.getReg(), DestReg); - addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, - Worklist); - Inst.eraseFromParent(); - } - continue; + case AMDGPU::S_ORN2_B32: + splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); + Inst.eraseFromParent(); + return; - case AMDGPU::S_CSELECT_B32: - case AMDGPU::S_CSELECT_B64: - lowerSelect(Worklist, Inst, MDT); - Inst.eraseFromParent(); - continue; - case AMDGPU::S_CMP_EQ_I32: - case AMDGPU::S_CMP_LG_I32: - case AMDGPU::S_CMP_GT_I32: - case AMDGPU::S_CMP_GE_I32: - case AMDGPU::S_CMP_LT_I32: - case AMDGPU::S_CMP_LE_I32: - case AMDGPU::S_CMP_EQ_U32: - case AMDGPU::S_CMP_LG_U32: - case AMDGPU::S_CMP_GT_U32: - case AMDGPU::S_CMP_GE_U32: - case AMDGPU::S_CMP_LT_U32: - case AMDGPU::S_CMP_LE_U32: - case AMDGPU::S_CMP_EQ_U64: - case AMDGPU::S_CMP_LG_U64: { - const MCInstrDesc &NewDesc = get(NewOpcode); - Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass()); - MachineInstr *NewInstr = - BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg) - .add(Inst.getOperand(0)) - .add(Inst.getOperand(1)); - legalizeOperands(*NewInstr, MDT); - int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); - 
addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); - Inst.eraseFromParent(); - } - continue; - } + // TODO: remove as soon as everything is ready + // to replace VGPR to SGPR copy with V_READFIRSTLANEs. + // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO + // can only be selected from the uniform SDNode. + case AMDGPU::S_ADD_CO_PSEUDO: + case AMDGPU::S_SUB_CO_PSEUDO: { + unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) + ? AMDGPU::V_ADDC_U32_e64 + : AMDGPU::V_SUBB_U32_e64; + const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { - // We cannot move this instruction to the VALU, so we should try to - // legalize its operands instead. - CreatedBBTmp = legalizeOperands(Inst, MDT); - if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) - CreatedBB = CreatedBBTmp; - continue; + Register CarryInReg = Inst.getOperand(4).getReg(); + if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { + Register NewCarryReg = MRI.createVirtualRegister(CarryRC); + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) + .addReg(CarryInReg); } - // Handle converting generic instructions like COPY-to-SGPR into - // COPY-to-VGPR. - if (NewOpcode == Opcode) { - Register DstReg = Inst.getOperand(0).getReg(); - const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); - - if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && - NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { - // Instead of creating a copy where src and dst are the same register - // class, we just replace all uses of dst with src. These kinds of - // copies interfere with the heuristics MachineSink uses to decide - // whether or not to split a critical edge. Since the pass assumes - // that copies will end up as machine instructions and not be - // eliminated. 
- addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); - MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); - MRI.clearKillFlags(Inst.getOperand(1).getReg()); - Inst.getOperand(0).setReg(DstReg); - - // Make sure we don't leave around a dead VGPR->SGPR copy. Normally - // these are deleted later, but at -O0 it would leave a suspicious - // looking illegal copy of an undef register. - for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) - Inst.removeOperand(I); - Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); - continue; - } - - Register NewDstReg = MRI.createVirtualRegister(NewDstRC); - MRI.replaceRegWith(DstReg, NewDstReg); - legalizeOperands(Inst, MDT); - addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); - continue; - } + Register CarryOutReg = Inst.getOperand(1).getReg(); - // Use the new VALU Opcode. - auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode)) - .setMIFlags(Inst.getFlags()); - for (const MachineOperand &Op : Inst.explicit_operands()) - NewInstr->addOperand(Op); + Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( + MRI.getRegClass(Inst.getOperand(0).getReg()))); + MachineInstr *CarryOp = + BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) + .addReg(CarryOutReg, RegState::Define) + .add(Inst.getOperand(2)) + .add(Inst.getOperand(3)) + .addReg(CarryInReg) + .addImm(0); + legalizeOperands(*CarryOp); + MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); + Inst.eraseFromParent(); + } + return; + case AMDGPU::S_UADDO_PSEUDO: + case AMDGPU::S_USUBO_PSEUDO: { + const DebugLoc &DL = Inst.getDebugLoc(); + MachineOperand &Dest0 = Inst.getOperand(0); + MachineOperand &Dest1 = Inst.getOperand(1); + MachineOperand &Src0 = Inst.getOperand(2); + MachineOperand &Src1 = Inst.getOperand(3); - // Remove any references to SCC. 
Vector instructions can't read from it, and - // We're just about to add the implicit use / defs of VCC, and we don't want - // both. - for (MachineOperand &Op : Inst.implicit_operands()) { - if (Op.getReg() == AMDGPU::SCC) { - // Only propagate through live-def of SCC. - if (Op.isDef() && !Op.isDead()) - addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); - if (Op.isUse()) - addSCCDefsToVALUWorklist(NewInstr, Worklist); - } - } + unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) + ? AMDGPU::V_ADD_CO_U32_e64 + : AMDGPU::V_SUB_CO_U32_e64; + const TargetRegisterClass *NewRC = + RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); + Register DestReg = MRI.createVirtualRegister(NewRC); + MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) + .addReg(Dest1.getReg(), RegState::Define) + .add(Src0) + .add(Src1) + .addImm(0); // clamp bit + legalizeOperands(*NewInstr, MDT); + MRI.replaceRegWith(Dest0.getReg(), DestReg); + addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, + Worklist); Inst.eraseFromParent(); + } + return; - Register NewDstReg; - if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) { - Register DstReg = NewInstr->getOperand(0).getReg(); - assert(DstReg.isVirtual()); - - // Update the destination register class. 
- const TargetRegisterClass *NewDstRC = - getDestEquivalentVGPRClass(*NewInstr); - assert(NewDstRC); + case AMDGPU::S_CSELECT_B32: + case AMDGPU::S_CSELECT_B64: + lowerSelect(Worklist, Inst, MDT); + Inst.eraseFromParent(); + return; + case AMDGPU::S_CMP_EQ_I32: + case AMDGPU::S_CMP_LG_I32: + case AMDGPU::S_CMP_GT_I32: + case AMDGPU::S_CMP_GE_I32: + case AMDGPU::S_CMP_LT_I32: + case AMDGPU::S_CMP_LE_I32: + case AMDGPU::S_CMP_EQ_U32: + case AMDGPU::S_CMP_LG_U32: + case AMDGPU::S_CMP_GT_U32: + case AMDGPU::S_CMP_GE_U32: + case AMDGPU::S_CMP_LT_U32: + case AMDGPU::S_CMP_LE_U32: + case AMDGPU::S_CMP_EQ_U64: + case AMDGPU::S_CMP_LG_U64: { + const MCInstrDesc &NewDesc = get(NewOpcode); + Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass()); + MachineInstr *NewInstr = + BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg) + .add(Inst.getOperand(0)) + .add(Inst.getOperand(1)); + legalizeOperands(*NewInstr, MDT); + int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC); + MachineOperand SCCOp = Inst.getOperand(SCCIdx); + addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); + Inst.eraseFromParent(); + } + return; + } - NewDstReg = MRI.createVirtualRegister(NewDstRC); - MRI.replaceRegWith(DstReg, NewDstReg); - } + if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { + // We cannot move this instruction to the VALU, so we should try to + // legalize its operands instead. + legalizeOperands(Inst, MDT); + return; + } + // Handle converting generic instructions like COPY-to-SGPR into + // COPY-to-VGPR. + if (NewOpcode == Opcode) { + Register DstReg = Inst.getOperand(0).getReg(); + const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); - if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { - // We are converting these to a BFE, so we need to add the missing - // operands for the size and offset. - unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 
8 : 16; - NewInstr.addImm(0); - NewInstr.addImm(Size); - } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { - // The VALU version adds the second operand to the result, so insert an - // extra 0 operand. - NewInstr.addImm(0); + if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && + NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) { + // Instead of creating a copy where src and dst are the same register + // class, we just replace all uses of dst with src. These kinds of + // copies interfere with the heuristics MachineSink uses to decide + // whether or not to split a critical edge. Since the pass assumes + // that copies will end up as machine instructions and not be + // eliminated. + addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist); + MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); + MRI.clearKillFlags(Inst.getOperand(1).getReg()); + Inst.getOperand(0).setReg(DstReg); + // Make sure we don't leave around a dead VGPR->SGPR copy. Normally + // these are deleted later, but at -O0 it would leave a suspicious + // looking illegal copy of an undef register. + for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) + Inst.removeOperand(I); + Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); + return; } + Register NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + legalizeOperands(Inst, MDT); + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); + return; + } - if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { - const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2); - // If we need to move this to VGPRs, we need to unpack the second operand - // back into the 2 separate ones for bit offset and width. - assert(OffsetWidthOp.isImm() && - "Scalar BFE is only implemented for constant width and offset"); - uint32_t Imm = OffsetWidthOp.getImm(); - - uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. - uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. 
- NewInstr->removeOperand(2); - NewInstr.addImm(Offset); - NewInstr.addImm(BitWidth); + // Use the new VALU Opcode. + auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode)) + .setMIFlags(Inst.getFlags()); + for (const MachineOperand &Op : Inst.explicit_operands()) + NewInstr->addOperand(Op); + // Remove any references to SCC. Vector instructions can't read from it, and + // We're just about to add the implicit use / defs of VCC, and we don't want + // both. + for (MachineOperand &Op : Inst.implicit_operands()) { + if (Op.getReg() == AMDGPU::SCC) { + // Only propagate through live-def of SCC. + if (Op.isDef() && !Op.isDead()) + addSCCDefUsersToVALUWorklist(Op, Inst, Worklist); + if (Op.isUse()) + addSCCDefsToVALUWorklist(NewInstr, Worklist); } - - fixImplicitOperands(*NewInstr); - - // Legalize the operands - CreatedBBTmp = legalizeOperands(*NewInstr, MDT); - if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp) - CreatedBB = CreatedBBTmp; - - if (NewDstReg) - addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); } - return CreatedBB; + Inst.eraseFromParent(); + Register NewDstReg; + if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) { + Register DstReg = NewInstr->getOperand(0).getReg(); + assert(DstReg.isVirtual()); + // Update the destination register class. + const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr); + assert(NewDstRC); + NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + } + if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { + // We are converting these to a BFE, so we need to add the missing + // operands for the size and offset. + unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; + NewInstr.addImm(0); + NewInstr.addImm(Size); + } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { + // The VALU version adds the second operand to the result, so insert an + // extra 0 operand. 
+ NewInstr.addImm(0); + } + if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { + const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2); + // If we need to move this to VGPRs, we need to unpack the second operand + // back into the 2 separate ones for bit offset and width. + assert(OffsetWidthOp.isImm() && + "Scalar BFE is only implemented for constant width and offset"); + uint32_t Imm = OffsetWidthOp.getImm(); + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + NewInstr->removeOperand(2); + NewInstr.addImm(Offset); + NewInstr.addImm(BitWidth); + } + fixImplicitOperands(*NewInstr); + // Legalize the operands + legalizeOperands(*NewInstr, MDT); + if (NewDstReg) + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); } // Add/sub require special handling to deal with carry outs. std::pair<bool, MachineBasicBlock *> -SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, +SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT) const { if (ST.hasAddNoCarry()) { // Assume there is no user of scc since we don't select this in that case. 
@@ -6604,7 +6795,7 @@ SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, return std::pair(false, nullptr); } -void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, +void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -6680,7 +6871,7 @@ void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, +void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -6707,7 +6898,7 @@ void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, +void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -6772,7 +6963,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, } } -void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, +void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -6801,7 +6992,7 @@ void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); } -void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, +void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -6830,9 +7021,9 @@ void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); } -void 
SIInstrInfo::splitScalar64BitUnaryOp( - SetVectorType &Worklist, MachineInstr &Inst, - unsigned Opcode, bool Swap) const { +void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, + MachineInstr &Inst, unsigned Opcode, + bool Swap) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -6889,7 +7080,7 @@ void SIInstrInfo::splitScalar64BitUnaryOp( addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, +void SIInstrInfo::splitScalar64BitAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT) const { bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); @@ -6963,7 +7154,7 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, +void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode, MachineDominatorTree *MDT) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -7030,7 +7221,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, +void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -7072,8 +7263,8 @@ void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, Worklist.insert(&Xor); } -void SIInstrInfo::splitScalar64BitBCNT( - SetVectorType &Worklist, MachineInstr &Inst) const { +void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist, + MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -7110,7 +7301,7 @@ void SIInstrInfo::splitScalar64BitBCNT( 
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, +void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -7172,9 +7363,8 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, } void SIInstrInfo::addUsersToMoveToVALUWorklist( - Register DstReg, - MachineRegisterInfo &MRI, - SetVectorType &Worklist) const { + Register DstReg, MachineRegisterInfo &MRI, + SIInstrWorklist &Worklist) const { for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), E = MRI.use_end(); I != E;) { MachineInstr &UseMI = *I->getParent(); @@ -7208,7 +7398,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( } } -void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, +void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const { Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -7283,7 +7473,7 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, MachineInstr &SCCDefInst, - SetVectorType &Worklist, + SIInstrWorklist &Worklist, Register NewCond) const { // Ensure that def inst defines SCC, which is still live. @@ -7326,7 +7516,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, // sure that the instruction that defines SCC is added to the moveToVALU // worklist. void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst, - SetVectorType &Worklist) const { + SIInstrWorklist &Worklist) const { // Look for a preceding instruction that either defines VCC or SCC. If VCC // then there is nothing to do because the defining instruction has been // converted to a VALU already. 
If SCC then that instruction needs to be @@ -7811,6 +8001,16 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const { return ArrayRef(TargetFlags); } +unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg, + const MachineFunction &MF) const { + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + assert(SrcReg.isVirtual()); + if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG)) + return AMDGPU::WWM_COPY; + + return AMDGPU::COPY; +} + bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && MI.modifiesRegister(AMDGPU::EXEC, &RI); @@ -7843,7 +8043,9 @@ MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, // If available, prefer to use vcc. Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC) ? Register(RI.getVCC()) - : RS.scavengeRegister(RI.getBoolRC(), I, 0, false); + : RS.scavengeRegisterBackwards( + *RI.getBoolRC(), I, /* RestoreAfter */ false, + 0, /* AllowSpill */ false); // TODO: Users need to deal with this. if (!UnusedCarry.isValid()) @@ -7874,10 +8076,15 @@ const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) con } } +unsigned SIInstrInfo::getMaxMUBUFImmOffset() { return (1 << 12) - 1; } + void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const { if (!ST.isWave32()) return; + if (MI.isInlineAsm()) + return; + for (auto &Op : MI.implicit_operands()) { if (Op.isReg() && Op.getReg() == AMDGPU::VCC) Op.setReg(AMDGPU::VCC_LO); @@ -7897,6 +8104,52 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass); } +// Given Imm, split it into the values to put into the SOffset and ImmOffset +// fields in an MUBUF instruction. Return false if it is not possible (due to a +// hardware bug needing a workaround). 
+// +// The required alignment ensures that individual address components remain +// aligned if they are aligned to begin with. It also ensures that additional +// offsets within the given alignment can be added to the resulting ImmOffset. +bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, + uint32_t &ImmOffset, Align Alignment) const { + const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(); + const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value()); + uint32_t Overflow = 0; + + if (Imm > MaxImm) { + if (Imm <= MaxImm + 64) { + // Use an SOffset inline constant for 4..64 + Overflow = Imm - MaxImm; + Imm = MaxImm; + } else { + // Try to keep the same value in SOffset for adjacent loads, so that + // the corresponding register contents can be re-used. + // + // Load values with all low-bits (except for alignment bits) set into + // SOffset, so that a larger range of values can be covered using + // s_movk_i32. + // + // Atomic operations fail to work correctly when individual address + // components are unaligned, even if their sum is aligned. + uint32_t High = (Imm + Alignment.value()) & ~MaxOffset; + uint32_t Low = (Imm + Alignment.value()) & MaxOffset; + Imm = Low; + Overflow = High - Alignment.value(); + } + } + + // There is a hardware bug in SI and CI which prevents address clamping in + // MUBUF instructions from working correctly with SOffsets. The immediate + // offset is unaffected. + if (Overflow > 0 && ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) + return false; + + ImmOffset = Imm; + SOffset = Overflow; + return true; +} + // Depending on the used address space and instructions, some immediate offsets // are allowed and some are not. 
// In general, flat instruction offsets can only be non-negative, global and @@ -7987,23 +8240,7 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, return {ImmField, RemainderOffset}; } -// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td -// and the columns of the getMCOpcodeGen table. -enum SIEncodingFamily { - SI = 0, - VI = 1, - SDWA = 2, - SDWA9 = 3, - GFX80 = 4, - GFX9 = 5, - GFX10 = 6, - SDWA10 = 7, - GFX90A = 8, - GFX940 = 9, - GFX11 = 10, -}; - -static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { +static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) { switch (ST.getGeneration()) { default: break; @@ -8042,7 +8279,7 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const { } int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { - SIEncodingFamily Gen = subtargetEncodingFamily(ST); + unsigned Gen = subtargetEncodingFamily(ST); if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && ST.getGeneration() == AMDGPUSubtarget::GFX9) @@ -8325,7 +8562,7 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl( // A similar issue also exists with spilling and reloading $exec registers. // // To prevent that, constrain the %0 register class here. - if (MI.isFullCopy()) { + if (isFullCopyInstr(MI)) { Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); if ((DstReg.isVirtual() || SrcReg.isVirtual()) && @@ -8368,9 +8605,20 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { unsigned opcode = MI.getOpcode(); if (opcode == AMDGPU::G_INTRINSIC || opcode == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS) { - return AMDGPU::isIntrinsicSourceOfDivergence(MI.getIntrinsicID()) - ? 
InstructionUniformity::NeverUniform - : InstructionUniformity::AlwaysUniform; + auto IID = static_cast<Intrinsic::ID>(MI.getIntrinsicID()); + if (AMDGPU::isIntrinsicSourceOfDivergence(IID)) + return InstructionUniformity::NeverUniform; + if (AMDGPU::isIntrinsicAlwaysUniform(IID)) + return InstructionUniformity::AlwaysUniform; + + switch (IID) { + case Intrinsic::amdgcn_if: + case Intrinsic::amdgcn_else: + // FIXME: Uniform if second result + break; + } + + return InstructionUniformity::Default; } // Loads from the private and flat address spaces are divergent, because @@ -8403,6 +8651,29 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { InstructionUniformity SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const { + + if (isNeverUniform(MI)) + return InstructionUniformity::NeverUniform; + + unsigned opcode = MI.getOpcode(); + if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32) + return InstructionUniformity::AlwaysUniform; + + if (isCopyInstr(MI)) { + const MachineOperand &srcOp = MI.getOperand(1); + if (srcOp.isReg() && srcOp.getReg().isPhysical()) { + const TargetRegisterClass *regClass = + RI.getPhysRegBaseClass(srcOp.getReg()); + return RI.isSGPRClass(regClass) ? 
InstructionUniformity::AlwaysUniform + : InstructionUniformity::NeverUniform; + } + return InstructionUniformity::Default; + } + + // GMIR handling + if (MI.isPreISelOpcode()) + return SIInstrInfo::getGenericInstructionUniformity(MI); + // Atomics are divergent because they are executed sequentially: when an // atomic operation refers to the same address in each thread, then each // thread after the first sees the value written by the previous thread as @@ -8429,44 +8700,26 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const { return InstructionUniformity::Default; } - unsigned opcode = MI.getOpcode(); - if (opcode == AMDGPU::COPY) { - const MachineOperand &srcOp = MI.getOperand(1); - if (srcOp.isReg() && srcOp.getReg().isPhysical()) { - const TargetRegisterClass *regClass = RI.getPhysRegBaseClass(srcOp.getReg()); - return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform - : InstructionUniformity::NeverUniform; - } - return InstructionUniformity::Default; - } - if (opcode == AMDGPU::INLINEASM || opcode == AMDGPU::INLINEASM_BR) { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - for (auto &op : MI.operands()) { - if (!op.isReg() || !op.isDef()) - continue; - auto *RC = MRI.getRegClass(op.getReg()); - if (!RC || RI.isDivergentRegClass(RC)) - return InstructionUniformity::NeverUniform; - } - return InstructionUniformity::AlwaysUniform; - } - if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32) - return InstructionUniformity::AlwaysUniform; + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo(); - if (opcode == AMDGPU::V_WRITELANE_B32) - return InstructionUniformity::NeverUniform; + // FIXME: It's conceptually broken to report this for an instruction, and not + // a specific def operand. For inline asm in particular, there could be mixed + // uniform and divergent results. 
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { + const MachineOperand &SrcOp = MI.getOperand(I); + if (!SrcOp.isReg()) + continue; - // GMIR handling - if (SIInstrInfo::isGenericOpcode(opcode)) - return SIInstrInfo::getGenericInstructionUniformity(MI); + Register Reg = SrcOp.getReg(); + if (!Reg || !SrcOp.readsReg()) + continue; - // Handling $vpgr reads - for (auto srcOp : MI.operands()) { - if (srcOp.isReg() && srcOp.getReg().isPhysical()) { - const TargetRegisterClass *regClass = RI.getPhysRegBaseClass(srcOp.getReg()); - if (RI.isVGPRClass(regClass)) - return InstructionUniformity::NeverUniform; - } + // If RegBank is null, this is unassigned or an unallocatable special + // register, which are all scalars. + const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI); + if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID) + return InstructionUniformity::NeverUniform; } // TODO: Uniformity check condtions above can be rearranged for more @@ -8622,7 +8875,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, else return false; - unsigned BitNo = countTrailingZeros((uint64_t)Mask); + unsigned BitNo = llvm::countr_zero((uint64_t)Mask); if (IsSigned && BitNo == SrcSize - 1) return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 025faec0e2cc..b25aae7b2fb0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -41,6 +41,41 @@ class ScheduleHazardRecognizer; static const MachineMemOperand::Flags MONoClobber = MachineMemOperand::MOTargetFlag1; +/// Utility to store machine instructions worklist. 
+struct SIInstrWorklist { + SIInstrWorklist() : InstrList() {} + + void insert(MachineInstr *MI); + + MachineInstr *top() const { + auto iter = InstrList.begin(); + return *iter; + } + + void erase_top() { + auto iter = InstrList.begin(); + InstrList.erase(iter); + } + + bool empty() const { return InstrList.empty(); } + + void clear() { + InstrList.clear(); + DeferredList.clear(); + } + + bool isDeferred(MachineInstr *MI); + + SetVector<MachineInstr *> &getDeferredList() { return DeferredList; } + +private: + /// InstrList contains the MachineInstrs. + SetVector<MachineInstr *> InstrList; + /// Deferred instructions are specific MachineInstr + /// that will be added by insert method. + SetVector<MachineInstr *> DeferredList; +}; + class SIInstrInfo final : public AMDGPUGenInstrInfo { private: const SIRegisterInfo RI; @@ -81,57 +116,50 @@ private: void swapOperands(MachineInstr &Inst) const; std::pair<bool, MachineBasicBlock *> - moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, + moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT = nullptr) const; - void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, + void lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT = nullptr) const; - void lowerScalarAbs(SetVectorType &Worklist, - MachineInstr &Inst) const; + void lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const; - void lowerScalarXnor(SetVectorType &Worklist, - MachineInstr &Inst) const; + void lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const; - void splitScalarNotBinop(SetVectorType &Worklist, - MachineInstr &Inst, + void splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode) const; - void splitScalarBinOpN2(SetVectorType &Worklist, - MachineInstr &Inst, + void splitScalarBinOpN2(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode) const; - void splitScalar64BitUnaryOp(SetVectorType &Worklist, - 
MachineInstr &Inst, unsigned Opcode, - bool Swap = false) const; + void splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst, + unsigned Opcode, bool Swap = false) const; - void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst, + void splitScalar64BitAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT = nullptr) const; - void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst, + void splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode, MachineDominatorTree *MDT = nullptr) const; - void splitScalar64BitXnor(SetVectorType &Worklist, MachineInstr &Inst, - MachineDominatorTree *MDT = nullptr) const; + void splitScalar64BitXnor(SIInstrWorklist &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT = nullptr) const; - void splitScalar64BitBCNT(SetVectorType &Worklist, + void splitScalar64BitBCNT(SIInstrWorklist &Worklist, MachineInstr &Inst) const; - void splitScalar64BitBFE(SetVectorType &Worklist, - MachineInstr &Inst) const; - void movePackToVALU(SetVectorType &Worklist, - MachineRegisterInfo &MRI, + void splitScalar64BitBFE(SIInstrWorklist &Worklist, MachineInstr &Inst) const; + void movePackToVALU(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const; void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI, - SetVectorType &Worklist) const; + SIInstrWorklist &Worklist) const; void addSCCDefUsersToVALUWorklist(MachineOperand &Op, MachineInstr &SCCDefInst, - SetVectorType &Worklist, + SIInstrWorklist &Worklist, Register NewCond = Register()) const; void addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst, - SetVectorType &Worklist) const; + SIInstrWorklist &Worklist) const; const TargetRegisterClass * getDestEquivalentVGPRClass(const MachineInstr &Inst) const; @@ -142,6 +170,12 @@ private: Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const; protected: + /// If the specific machine 
instruction is a instruction that moves/copies + /// value from one register to another register return destination and source + /// registers as machine operands. + std::optional<DestSourcePair> + isCopyInstrImpl(const MachineInstr &MI) const override; + bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const; @@ -626,6 +660,11 @@ public: return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill; } + static bool isWWMRegSpillOpcode(uint16_t Opcode) { + return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE || + Opcode == AMDGPU::SI_SPILL_WWM_V32_RESTORE; + } + static bool isDPP(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::DPP; } @@ -781,6 +820,10 @@ public: return get(Opcode).TSFlags & SIInstrFlags::FPAtomic; } + static bool isNeverUniform(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform; + } + static bool doesNotReadTiedSource(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::TiedSourceNotRead; } @@ -790,7 +833,7 @@ public: } bool isVGPRCopy(const MachineInstr &MI) const { - assert(MI.isCopy()); + assert(isCopyInstr(MI)); Register Dest = MI.getOperand(0).getReg(); const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -841,7 +884,7 @@ public: const MachineOperand &UseMO, const MachineOperand &DefMO) const { assert(UseMO.getParent() == &MI); - int OpIdx = MI.getOperandNo(&UseMO); + int OpIdx = UseMO.getOperandNo(); if (OpIdx >= MI.getDesc().NumOperands) return false; @@ -860,7 +903,7 @@ public: if (OpIdx >= MI.getDesc().NumOperands) return false; - if (MI.isCopy()) { + if (isCopyInstr(MI)) { unsigned Size = getOpSize(MI, OpIdx); assert(Size == 8 || Size == 4); @@ -873,8 +916,7 @@ public: } bool isInlineConstant(const MachineOperand &MO) const { - const MachineInstr *Parent = MO.getParent(); - return isInlineConstant(*Parent, Parent->getOperandNo(&MO)); + return 
isInlineConstant(*MO.getParent(), MO.getOperandNo()); } bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, @@ -908,6 +950,15 @@ public: unsigned getVALUOp(const MachineInstr &MI) const; + void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register Reg, bool IsSCCLive, + SlotIndexes *Indexes = nullptr) const; + + void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + Register Reg, SlotIndexes *Indexes = nullptr) const; + /// Return the correct register class for \p OpNo. For target-specific /// instructions, this will return the register class that has been defined /// in tablegen. For generic instructions, like REG_SEQUENCE it will return @@ -1005,11 +1056,14 @@ public: /// was moved to VGPR. \returns true if succeeded. bool moveFlatAddrToVGPR(MachineInstr &Inst) const; - /// Replace this instruction's opcode with the equivalent VALU - /// opcode. This function will also move the users of \p MI to the - /// VALU if necessary. If present, \p MDT is updated. - MachineBasicBlock *moveToVALU(MachineInstr &MI, - MachineDominatorTree *MDT = nullptr) const; + /// Replace the instructions opcode with the equivalent VALU + /// opcode. This function will also move the users of MachineInstruntions + /// in the \p WorkList to the VALU if necessary. If present, \p MDT is + /// updated. 
+ void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const; + + void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, + MachineInstr &Inst) const; void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; @@ -1095,6 +1149,9 @@ public: CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override; + unsigned getLiveRangeSplitOpcode(Register Reg, + const MachineFunction &MF) const override; + bool isBasicBlockPrologue(const MachineInstr &MI) const override; MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB, @@ -1132,6 +1189,11 @@ public: return isUInt<12>(Imm); } + static unsigned getMaxMUBUFImmOffset(); + + bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, + Align Alignment = Align(4)) const; + /// Returns if \p Offset is legal for the subtarget as the offset to a FLAT /// encoded instruction. If \p Signed, this is for an instruction that /// interprets the offset as signed. 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 2066abb0268d..044bc4507d3a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -50,14 +50,6 @@ def SIds_ordered_count : SDNode<"AMDGPUISD::DS_ORDERED_COUNT", [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain, SDNPInGlue] >; -def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2, - [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] ->; - -def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2, - [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] ->; - def SDTAtomic2_f32 : SDTypeProfile<1, 2, [ SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1> ]>; @@ -355,8 +347,6 @@ class isPackedType<ValueType SrcVT> { // PatFrags for global memory operations //===----------------------------------------------------------------------===// -defm atomic_inc : binary_atomic_op_all_as<SIatomic_inc>; -defm atomic_dec : binary_atomic_op_all_as<SIatomic_dec>; defm atomic_load_fmin : binary_atomic_op_all_as<SIatomic_fmin, 0>; defm atomic_load_fmax : binary_atomic_op_all_as<SIatomic_fmax, 0>; @@ -762,8 +752,8 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; -defm atomic_inc : SIAtomicM0Glue2 <"INC", 1>; -defm atomic_dec : SIAtomicM0Glue2 <"DEC", 1>; +defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">; +defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">; defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; @@ -931,144 +921,39 @@ def set_glc : SDNodeXForm<timm, [{ // Custom Operands //===----------------------------------------------------------------------===// -def SoppBrTarget : AsmOperandClass { - let Name = "SoppBrTarget"; - let ParserMethod = "parseSOppBrTarget"; 
-} - -def sopp_brtarget : Operand<OtherVT> { +def SOPPBrTarget : CustomOperand<OtherVT> { + let PrintMethod = "printOperand"; let EncoderMethod = "getSOPPBrEncoding"; - let DecoderMethod = "decodeSoppBrTarget"; + let DecoderMethod = "decodeSOPPBrTarget"; let OperandType = "OPERAND_PCREL"; - let ParserMatchClass = SoppBrTarget; } def si_ga : Operand<iPTR>; -def InterpSlotMatchClass : AsmOperandClass { - let Name = "InterpSlot"; - let PredicateMethod = "isInterpSlot"; - let ParserMethod = "parseInterpSlot"; - let RenderMethod = "addImmOperands"; -} - -def InterpSlot : Operand<i32> { - let PrintMethod = "printInterpSlot"; - let ParserMatchClass = InterpSlotMatchClass; - let OperandType = "OPERAND_IMMEDIATE"; -} - -def AttrMatchClass : AsmOperandClass { - let Name = "Attr"; - let PredicateMethod = "isInterpAttr"; - let ParserMethod = "parseInterpAttr"; - let RenderMethod = "addImmOperands"; -} +def InterpSlot : CustomOperand<i32>; // It appears to be necessary to create a separate operand for this to // be able to parse attr<num> with no space. 
-def Attr : Operand<i32> { - let PrintMethod = "printInterpAttr"; - let ParserMatchClass = AttrMatchClass; - let OperandType = "OPERAND_IMMEDIATE"; -} - -def AttrChanMatchClass : AsmOperandClass { - let Name = "AttrChan"; - let PredicateMethod = "isAttrChan"; - let RenderMethod = "addImmOperands"; -} - -def AttrChan : Operand<i32> { - let PrintMethod = "printInterpAttrChan"; - let ParserMatchClass = AttrChanMatchClass; - let OperandType = "OPERAND_IMMEDIATE"; -} +def InterpAttr : CustomOperand<i32>; -def SendMsgMatchClass : AsmOperandClass { - let Name = "SendMsg"; - let PredicateMethod = "isSendMsg"; - let ParserMethod = "parseSendMsgOp"; - let RenderMethod = "addImmOperands"; -} - -def SwizzleMatchClass : AsmOperandClass { - let Name = "Swizzle"; - let PredicateMethod = "isSwizzle"; - let ParserMethod = "parseSwizzleOp"; - let RenderMethod = "addImmOperands"; - let IsOptional = 1; -} - -def EndpgmMatchClass : AsmOperandClass { - let Name = "EndpgmImm"; - let PredicateMethod = "isEndpgm"; - let ParserMethod = "parseEndpgmOp"; - let RenderMethod = "addImmOperands"; - let IsOptional = 1; -} - -def ExpTgtMatchClass : AsmOperandClass { - let Name = "ExpTgt"; - let PredicateMethod = "isExpTgt"; - let ParserMethod = "parseExpTgt"; - let RenderMethod = "printExpTgt"; -} - -def SWaitMatchClass : AsmOperandClass { - let Name = "SWaitCnt"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "parseSWaitCntOps"; -} - -def DepCtrMatchClass : AsmOperandClass { - let Name = "DepCtr"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "parseDepCtrOps"; -} - -def SDelayMatchClass : AsmOperandClass { - let Name = "SDelayAlu"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "parseSDelayAluOps"; -} +def InterpAttrChan : ImmOperand<i32>; def VReg32OrOffClass : AsmOperandClass { let Name = "VReg32OrOff"; let ParserMethod = "parseVReg32OrOff"; } -let OperandType = "OPERAND_IMMEDIATE" in { -def SendMsgImm : Operand<i32> { - let PrintMethod = "printSendMsg"; - 
let ParserMatchClass = SendMsgMatchClass; -} +def SendMsg : CustomOperand<i32>; -def SwizzleImm : Operand<i16> { - let PrintMethod = "printSwizzle"; - let ParserMatchClass = SwizzleMatchClass; -} +def Swizzle : CustomOperand<i16, 1>; -def EndpgmImm : Operand<i16> { - let PrintMethod = "printEndpgm"; - let ParserMatchClass = EndpgmMatchClass; -} +def Endpgm : CustomOperand<i16, 1>; -def WAIT_FLAG : Operand <i32> { - let ParserMatchClass = SWaitMatchClass; - let PrintMethod = "printWaitFlag"; -} +def SWaitCnt : CustomOperand<i32>; -def DepCtrImm : Operand <i32> { - let ParserMatchClass = DepCtrMatchClass; - let PrintMethod = "printDepCtr"; -} +def DepCtr : CustomOperand<i32>; -def DELAY_FLAG : Operand <i32> { - let ParserMatchClass = SDelayMatchClass; - let PrintMethod = "printDelayFlag"; -} -} // End OperandType = "OPERAND_IMMEDIATE" +def SDelayALU : CustomOperand<i32>; include "SIInstrFormats.td" include "VIInstrFormats.td" @@ -1148,111 +1033,71 @@ def SDWAVopcDst : BoolRC { let PrintMethod = "printVOPDst"; } -class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass { - let Name = "Imm"#CName; - let PredicateMethod = "is"#CName; - let ParserMethod = !if(Optional, "", "parse"#CName); - let RenderMethod = "addImmOperands"; - let IsOptional = Optional; - let DefaultMethod = !if(Optional, "default"#CName, ?); -} - -class CustomOperandClass<string CName, bit Optional> : AsmOperandClass { - let Name = CName; - let PredicateMethod = "is"#CName; - let ParserMethod = "parse"#CName; - let RenderMethod = "addImmOperands"; - let IsOptional = Optional; - let DefaultMethod = "default"#CName; -} - -class CustomOperandProps<bit Optional = 0, string Name = NAME, - AsmOperandClass Class = CustomOperandClass<Name, Optional>> { - string PrintMethod = "print"#Name; - AsmOperandClass ParserMatchClass = Class; -} - -class CustomOperand<ValueType Type, bit Optional = 0, string Name = NAME, - AsmOperandClass Class = CustomOperandClass<Name, Optional>> - : Operand<Type>, 
CustomOperandProps<Optional, Name, Class>; - -class NamedIntOperandClass<string Prefix, string Name, string ConvertMethod> - : CustomOperandClass<Name, 1> { - string ImmTy = "AMDGPUOperand::ImmTy"#Name; - let ParserMethod = - "[this](OperandVector &Operands) -> OperandMatchResultTy { "# - "return parseIntWithPrefix(\""#Prefix#"\", Operands, "#ImmTy#", "# - ConvertMethod#"); }"; -} - class NamedIntOperand<ValueType Type, string Prefix, string Name = NAME, string ConvertMethod = "nullptr"> - : CustomOperand<Type, 1, Name, NamedIntOperandClass<Prefix, Name, ConvertMethod>>; - -class BitOperandClass<string Id, string Name> - : CustomOperandClass<Name, 1> { - string ImmTy = "AMDGPUOperand::ImmTy"#Name; + : CustomOperand<Type, 1, Name> { let ParserMethod = - "[this](OperandVector &Operands) -> OperandMatchResultTy { "# - "return parseNamedBit(\""#Id#"\", Operands, "#ImmTy#"); }"; + "[this](OperandVector &Operands) -> ParseStatus { "# + "return parseIntWithPrefix(\""#Prefix#"\", Operands, "# + "AMDGPUOperand::"#ImmTy#", "#ConvertMethod#"); }"; } class NamedBitOperand<string Id, string Name = NAME> - : CustomOperand<i1, 1, Name, BitOperandClass<Id, Name>>; - -class DefaultOperand_0<CustomOperand Op> - : OperandWithDefaultOps<Op.Type, (ops (Op.Type 0))>, - CustomOperandProps<1, Op.ParserMatchClass.Name, Op.ParserMatchClass>; - -class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> { - let PrintMethod = "print"#Name; - let ParserMatchClass = MatchClass; + : CustomOperand<i1, 1, Name> { + let ParserMethod = + "[this](OperandVector &Operands) -> ParseStatus { "# + "return parseNamedBit(\""#Id#"\", Operands, AMDGPUOperand::"#ImmTy#"); }"; + let PrintMethod = "[this](const MCInst *MI, unsigned OpNo, "# + "const MCSubtargetInfo &STI, raw_ostream &O) { "# + "printNamedBit(MI, OpNo, O, \""#Id#"\"); }"; } -class NamedOperandU32_0<string Name, AsmOperandClass MatchClass> : - OperandWithDefaultOps<i32, (ops (i32 0))> { - let PrintMethod = "print"#Name; - let 
ParserMatchClass = MatchClass; +class DefaultOperand<CustomOperand Op, int Value> + : OperandWithDefaultOps<Op.Type, (ops (Op.Type Value))>, + CustomOperandProps<1, Op.ParserMatchClass.Name> { + let ParserMethod = Op.ParserMatchClass.ParserMethod; + let PrintMethod = Op.PrintMethod; } -class NamedOperandU32Default0<string Name, AsmOperandClass MatchClass> : - OperandWithDefaultOps<i32, (ops (i32 0))> { - let PrintMethod = "print"#Name; - let ParserMatchClass = MatchClass; +class SDWAOperand<string Id, string Name = NAME> + : CustomOperand<i32, 1, Name> { + let ParserMethod = + "[this](OperandVector &Operands) -> ParseStatus { "# + "return parseSDWASel(Operands, \""#Id#"\", AMDGPUOperand::"#ImmTy#"); }"; } -class NamedOperandU32Default1<string Name, AsmOperandClass MatchClass> : - OperandWithDefaultOps<i32, (ops (i32 1))> { - let PrintMethod = "print"#Name; - let ParserMatchClass = MatchClass; +class ArrayOperand0<string Id, string Name = NAME> + : OperandWithDefaultOps<i32, (ops (i32 0))>, + CustomOperandProps<1, Name> { + let ParserMethod = + "[this](OperandVector &Operands) -> ParseStatus { "# + "return parseOperandArrayWithPrefix(\""#Id#"\", Operands, "# + "AMDGPUOperand::"#ImmTy#"); }"; } -let OperandType = "OPERAND_IMMEDIATE" in { - -def flat_offset : CustomOperand<i16, 1, "FlatOffset">; -def offset : NamedIntOperand<i16, "offset", "Offset">; +let ImmTy = "ImmTyOffset" in +def flat_offset : CustomOperand<i32, 1, "FlatOffset">; +def offset : NamedIntOperand<i32, "offset", "Offset">; def offset0 : NamedIntOperand<i8, "offset0", "Offset0">; def offset1 : NamedIntOperand<i8, "offset1", "Offset1">; def gds : NamedBitOperand<"gds", "GDS">; -def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>; -def omod0 : NamedOperandU32_0<"OModSI", NamedMatchClass<"OModSI">>; +def omod : CustomOperand<i32, 1, "OModSI">; +def omod0 : DefaultOperand<omod, 0>; // We need to make the cases with a default of 0 distinct from no // default to help deal with some cases where 
the operand appears // before a mandatory operand. def clampmod : NamedBitOperand<"clamp", "ClampSI">; -def clampmod0 : DefaultOperand_0<clampmod>; +def clampmod0 : DefaultOperand<clampmod, 0>; def highmod : NamedBitOperand<"high", "High">; -def CPol : NamedOperandU32<"CPol", NamedMatchClass<"CPol">>; -def CPol_0 : NamedOperandU32Default0<"CPol", NamedMatchClass<"CPol">>; -def CPol_GLC1 : NamedOperandU32Default1<"CPol", NamedMatchClass<"CPol">>; +def CPol : CustomOperand<i32, 1>; +def CPol_0 : DefaultOperand<CPol, 0>; +def CPol_GLC1 : DefaultOperand<CPol, 1>; def TFE : NamedBitOperand<"tfe">; -def SWZ : NamedBitOperand<"swz">; -def SWZ_0 : DefaultOperand_0<SWZ>; def UNorm : NamedBitOperand<"unorm">; def DA : NamedBitOperand<"da">; def R128A16 : CustomOperand<i1, 1>; @@ -1267,62 +1112,51 @@ def FORMAT : CustomOperand<i8>; def DMask : NamedIntOperand<i16, "dmask">; def Dim : CustomOperand<i8>; -def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>; -def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>; -def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>; -def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>; - -def op_sel0 : NamedOperandU32Default0<"OpSel", NamedMatchClass<"OpSel">>; -def op_sel_hi0 : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>; -def neg_lo0 : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>; -def neg_hi0 : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>; +def dst_sel : SDWAOperand<"dst_sel", "SDWADstSel">; +def src0_sel : SDWAOperand<"src0_sel", "SDWASrc0Sel">; +def src1_sel : SDWAOperand<"src1_sel", "SDWASrc1Sel">; +def dst_unused : CustomOperand<i32, 1, "SDWADstUnused">; -def dpp8 : NamedOperandU32<"DPP8", NamedMatchClass<"DPP8", 0>>; -def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; +def op_sel0 : ArrayOperand0<"op_sel", "OpSel">; +def op_sel_hi0 : ArrayOperand0<"op_sel_hi", 
"OpSelHi">; +def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">; +def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">; -def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>; -def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>; -def bound_ctrl : NamedIntOperand<i1, "bound_ctrl", "DppBoundCtrl", "ConvertDppBoundCtrl">; -def FI : NamedOperandU32<"FI", NamedMatchClass<"FI">>; +def dpp8 : CustomOperand<i32, 0, "DPP8">; +def dpp_ctrl : CustomOperand<i32, 0, "DPPCtrl">; -def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>; -def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>; -def abid : NamedOperandU32<"ABID", NamedMatchClass<"ABID">>; +let DefaultValue = "0xf" in { +def row_mask : NamedIntOperand<i32, "row_mask", "DppRowMask">; +def bank_mask : NamedIntOperand<i32, "bank_mask", "DppBankMask">; +} +def bound_ctrl : NamedIntOperand<i1, "bound_ctrl", "DppBoundCtrl", + "[this] (int64_t &BC) -> bool { return convertDppBoundCtrl(BC); }">; +def FI : NamedIntOperand<i32, "fi", "DppFI">; -def hwreg : NamedOperandU32<"Hwreg", NamedMatchClass<"Hwreg", 0>>; +def blgp : CustomOperand<i32, 1, "BLGP">; +def cbsz : NamedIntOperand<i32, "cbsz", "CBSZ">; +def abid : NamedIntOperand<i32, "abid", "ABID">; -def exp_tgt : NamedOperandU32<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> { +def hwreg : CustomOperand<i32, 0, "Hwreg">; -} +def exp_tgt : CustomOperand<i32, 0, "ExpTgt">; def wait_vdst : NamedIntOperand<i8, "wait_vdst", "WaitVDST">; def wait_exp : NamedIntOperand<i8, "wait_exp", "WaitEXP">; -} // End OperandType = "OPERAND_IMMEDIATE" - -class KImmMatchClass<int size> : AsmOperandClass { - let Name = "KImmFP"#size; - let PredicateMethod = "isKImmFP"#size; - let ParserMethod = "parseImm"; - let RenderMethod = "addKImmFP"#size#"Operands"; -} - -class kimmOperand<ValueType vt> : Operand<vt> { +class KImmFPOperand<ValueType vt> : ImmOperand<vt> { let OperandNamespace = "AMDGPU"; let OperandType = "OPERAND_KIMM"#vt.Size; let PrintMethod = 
"printU"#vt.Size#"ImmOperand"; - let ParserMatchClass = !cast<AsmOperandClass>("KImmFP"#vt.Size#"MatchClass"); - let DecoderMethod = "decodeOperand_f"#vt.Size#"kimm"; + let DecoderMethod = "decodeOperand_KImmFP"; } // 32-bit VALU immediate operand that uses the constant bus. -def KImmFP32MatchClass : KImmMatchClass<32>; -def f32kimm : kimmOperand<i32>; +def KImmFP32 : KImmFPOperand<i32>; // 32-bit VALU immediate operand with a 16-bit value that uses the // constant bus. -def KImmFP16MatchClass : KImmMatchClass<16>; -def f16kimm : kimmOperand<i16>; +def KImmFP16 : KImmFPOperand<i16>; class FPInputModsMatchClass <int opSize> : AsmOperandClass { let Name = "RegOrImmWithFP"#opSize#"InputMods"; @@ -1506,7 +1340,16 @@ def DS128Bit8ByteAligned : ComplexPattern<iPTR, 3, "SelectDS128Bit8ByteAligned"> def MOVRELOffset : ComplexPattern<iPTR, 2, "SelectMOVRELOffset">; def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; + +// Modifiers for floating point instructions. def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; + +// VOP3 modifiers used for instructions that do not read canonicalized +// floating point values (i.e. 
integer operations with FP source +// modifiers) +def VOP3ModsNonCanonicalizing : ComplexPattern<untyped, 2, + "SelectVOP3ModsNonCanonicalizing">; + def VOP3NoMods : ComplexPattern<untyped, 1, "SelectVOP3NoMods">; def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">; @@ -1521,7 +1364,8 @@ def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">; def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">; -def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">; +def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">; +def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">; def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">; def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">; @@ -1717,7 +1561,7 @@ class getVOP3SrcForVT<ValueType VT> { bit isFP = isFloatType<VT>.ret; RegisterOperand ret = !if(!eq(VT.Size, 128), - VSrc_128, + VRegSrc_128, !if(!eq(VT.Size, 64), !if(isFP, !if(!eq(VT.Value, v2f32.Value), @@ -2390,14 +2234,6 @@ class getLdStRegisterOperand<RegisterClass RC> { ))))); } -class BitOr<bit a, bit b> { - bit ret = !if(a, 1, !if(b, 1, 0)); -} - -class BitAnd<bit a, bit b> { - bit ret = !if(a, !if(b, 1, 0), 0); -} - class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32, ValueType Src1VT = i32, ValueType Src2VT = i32> { bit ret = !if(!eq(DstVT.Size, 64), @@ -2445,7 +2281,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret; field RegisterClass Src2DPP = getVregSrcForVT<Src2VT>.ret; field RegisterOperand Src0VOP3DPP = VGPRSrc_32; - field RegisterOperand Src1VOP3DPP = VGPRSrc_32; + field RegisterOperand Src1VOP3DPP = VRegSrc_32; field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret; field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret; field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret; @@ -2509,8 +2345,7 @@ class VOPProfile 
<list<ValueType> _ArgVT, bit _EnableClamp = 0> { field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; field bit HasExtVOP3DPP = getHasVOP3DPP<DstVT, Src0VT, Src1VT, Src2VT>.ret; - field bit HasExtDPP = !if(!or(getHasDPP<NumSrcArgs>.ret, - HasExtVOP3DPP), 1, 0); + field bit HasExtDPP = !or(getHasDPP<NumSrcArgs>.ret, HasExtVOP3DPP); field bit HasExt32BitDPP = getHasExt32BitDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; field bit HasExt64BitDPP = getHasExt64BitDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; field bit HasExtSDWA = getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 0c2a13852fcb..7fe76b4c13ca 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -47,7 +47,7 @@ let Uses = [MODE, M0, EXEC] in { multiclass V_INTERP_P1_F32_m : VINTRP_m < 0x00000000, (outs VINTRPDst:$vdst), - (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), + (ins VGPR_32:$vsrc, InterpAttr:$attr, InterpAttrChan:$attrchan), "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan", [(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc, (i32 timm:$attrchan), (i32 timm:$attr), M0))] @@ -73,7 +73,8 @@ let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in { defm V_INTERP_P2_F32 : VINTRP_m < 0x00000001, (outs VINTRPDst:$vdst), - (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), + (ins VGPR_32:$src0, VGPR_32:$vsrc, InterpAttr:$attr, + InterpAttrChan:$attrchan), "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan", [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc, (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; @@ -83,7 +84,7 @@ defm V_INTERP_P2_F32 : VINTRP_m < defm V_INTERP_MOV_F32 : VINTRP_m < 0x00000002, (outs VINTRPDst:$vdst), - (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan), + (ins InterpSlot:$vsrc, InterpAttr:$attr, InterpAttrChan:$attrchan), "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan", [(set f32:$vdst, 
(int_amdgcn_interp_mov (i32 timm:$vsrc), (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; @@ -95,6 +96,16 @@ defm V_INTERP_MOV_F32 : VINTRP_m < //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// + +// Insert a branch to an endpgm block to use as a fallback trap. +def ENDPGM_TRAP : SPseudoInstSI< + (outs), (ins), + [(AMDGPUendpgm_trap)], + "ENDPGM_TRAP"> { + let hasSideEffects = 1; + let usesCustomInserter = 1; +} + def ATOMIC_FENCE : SPseudoInstSI< (outs), (ins i32imm:$ordering, i32imm:$scope), [(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))], @@ -161,6 +172,13 @@ def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] +def WWM_COPY : SPseudoInstSI < + (outs unknown:$dst), (ins unknown:$src)> { + let hasSideEffects = 0; + let isAsCheapAsAMove = 1; + let isConvergent = 1; +} + def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { let Uses = [EXEC]; let Defs = [EXEC, SCC]; @@ -189,6 +207,12 @@ def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { let mayStore = 0; } +let usesCustomInserter = 1 in { +def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$mask)>; + +def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>; +} // End usesCustomInserter = 1 + // PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes. def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> { let Uses = [EXEC]; @@ -222,7 +246,7 @@ def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst), // Invert the exec mask and overwrite the inactive lanes of dst with inactive, // restoring it after we're done. 
-let Defs = [SCC] in { +let Defs = [SCC], isConvergent = 1 in { def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst), (ins VSrc_b32: $src, VSrc_b32:$inactive), [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> { @@ -234,6 +258,18 @@ def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst), } } // End Defs = [SCC] +let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { + def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> { + } +} + let usesCustomInserter = 1, Defs = [VCC, EXEC] in { def V_ADD_U64_PSEUDO : VPseudoInstSI < (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), @@ -300,6 +336,7 @@ def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>; def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>; def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>; def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>; +def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst<S_AND_SAVEEXEC_B64>; } let WaveSizePredicate = isWave32 in { @@ -308,6 +345,7 @@ def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>; def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>; def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>; def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>; +def S_AND_SAVEEXEC_B32_term : WrapTerminatorInst<S_AND_SAVEEXEC_B32>; } @@ -368,7 +406,13 @@ def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask), // SI pseudo instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. 
-let isTerminator = 1 in { +// As we have enhanced control flow intrinsics to work under unstructured CFG, +// duplicating such intrinsics can be actually treated as legal. On the contrary, +// by making them non-duplicable, we are observing better code generation result. +// So we choose to mark them non-duplicable in hope of getting better code +// generation as well as simplied CFG during Machine IR optimization stage. + +let isTerminator = 1, isNotDuplicable = 1 in { let OtherPredicates = [EnableLateCFGStructurize] in { def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI < @@ -385,6 +429,7 @@ def SI_IF: CFPseudoInstSI < let Constraints = ""; let Size = 12; let hasSideEffects = 1; + let IsNeverUniform = 1; } def SI_ELSE : CFPseudoInstSI < @@ -392,6 +437,7 @@ def SI_ELSE : CFPseudoInstSI < (ins SReg_1:$src, brtarget:$target), [], 1, 1> { let Size = 12; let hasSideEffects = 1; + let IsNeverUniform = 1; } def SI_WATERFALL_LOOP : CFPseudoInstSI < @@ -408,6 +454,7 @@ def SI_LOOP : CFPseudoInstSI < let Size = 8; let isBranch = 1; let hasSideEffects = 1; + let IsNeverUniform = 1; } } // End isTerminator = 1 @@ -418,6 +465,7 @@ def SI_END_CF : CFPseudoInstSI < let isAsCheapAsAMove = 1; let isReMaterializable = 1; let hasSideEffects = 1; + let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details. let mayLoad = 1; // FIXME: Should not need memory flags let mayStore = 1; } @@ -425,6 +473,7 @@ def SI_END_CF : CFPseudoInstSI < def SI_IF_BREAK : CFPseudoInstSI < (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> { let Size = 4; + let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details. let isAsCheapAsAMove = 1; let isReMaterializable = 1; } @@ -470,7 +519,7 @@ def SI_ILLEGAL_COPY : SPseudoInstSI < // Branch on undef scc. Used to avoid intermediate copy from // IMPLICIT_DEF to SCC. 
-def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> { +def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins SOPPBrTarget:$simm16)> { let isTerminator = 1; let usesCustomInserter = 1; let isBranch = 1; @@ -543,7 +592,7 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI < // Return for returning function calls. def SI_RETURN : SPseudoInstSI < - (outs), (ins), [(AMDGPUret_flag)], + (outs), (ins), [(AMDGPUret_glue)], "; return"> { let isTerminator = 1; let isBarrier = 1; @@ -584,10 +633,9 @@ def SI_CALL : SPseudoInstSI < let isConvergent = 1; } -// Tail call handling pseudo -def SI_TCRETURN : SPseudoInstSI <(outs), - (ins SReg_64:$src0, unknown:$callee, i32imm:$fpdiff), - [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> { +class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs), + (ins rc:$src0, unknown:$callee, i32imm:$fpdiff), + [(sd i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> { let Size = 4; let FixedSize = 1; let isCall = 1; @@ -600,10 +648,20 @@ def SI_TCRETURN : SPseudoInstSI <(outs), let isConvergent = 1; } +// Tail call handling pseudo +def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, AMDGPUtc_return>; +def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, AMDGPUtc_return_gfx>; + // Handle selecting indirect tail calls def : GCNPat< (AMDGPUtc_return i64:$src0, (i64 0), (i32 timm:$fpdiff)), - (SI_TCRETURN SReg_64:$src0, (i64 0), i32imm:$fpdiff) + (SI_TCRETURN CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff) +>; + +// Handle selecting indirect tail calls for AMDGPU_gfx +def : GCNPat< + (AMDGPUtc_return_gfx i64:$src0, (i64 0), (i32 timm:$fpdiff)), + (SI_TCRETURN_GFX Gfx_CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff) >; def ADJCALLSTACKUP : SPseudoInstSI< @@ -720,6 +778,10 @@ def S_INDIRECT_REG_WRITE_MOVREL_B32_V3 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo< def S_INDIRECT_REG_WRITE_MOVREL_B32_V4 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_128>; def S_INDIRECT_REG_WRITE_MOVREL_B32_V5 : 
S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_160>; def S_INDIRECT_REG_WRITE_MOVREL_B32_V8 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_256>; +def S_INDIRECT_REG_WRITE_MOVREL_B32_V9 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_288>; +def S_INDIRECT_REG_WRITE_MOVREL_B32_V10 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_320>; +def S_INDIRECT_REG_WRITE_MOVREL_B32_V11 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_352>; +def S_INDIRECT_REG_WRITE_MOVREL_B32_V12 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_384>; def S_INDIRECT_REG_WRITE_MOVREL_B32_V16 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_512>; def S_INDIRECT_REG_WRITE_MOVREL_B32_V32 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_1024>; @@ -890,6 +952,9 @@ defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>; defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>; defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>; +let isConvergent = 1 in +defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>; + def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < (outs SReg_64:$dst), (ins si_ga:$ptr_lo, si_ga:$ptr_hi), @@ -954,25 +1019,6 @@ def : Pat < // VOP1 Patterns //===----------------------------------------------------------------------===// -let OtherPredicates = [UnsafeFPMath] in { - -// Convert (x - floor(x)) to fract(x) -def : GCNPat < - (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), - (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), - (V_FRACT_F32_e64 $mods, $x) ->; - -// Convert (x + (-floor(x))) to fract(x) -def : GCNPat < - (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), - (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), - (V_FRACT_F64_e64 $mods, $x) ->; - -} // End OtherPredicates = [UnsafeFPMath] - - multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> { // f16_to_fp patterns def : GCNPat < @@ -1094,8 +1140,8 @@ def : GCNPat < >; class VOPSelectModsPat <ValueType vt> : GCNPat < - (vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods), - (VOP3Mods vt:$src2, i32:$src2_mods))), + (vt 
(select i1:$src0, (VOP3ModsNonCanonicalizing vt:$src1, i32:$src1_mods), + (VOP3ModsNonCanonicalizing vt:$src2, i32:$src2_mods))), (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2, FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0) >; @@ -1343,66 +1389,6 @@ foreach Index = 0-15 in { } -def : Pat < - (extract_subvector v4i16:$vec, (i32 0)), - (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0)) ->; - -def : Pat < - (extract_subvector v4i16:$vec, (i32 2)), - (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1)) ->; - -def : Pat < - (extract_subvector v4f16:$vec, (i32 0)), - (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0)) ->; - -def : Pat < - (extract_subvector v4f16:$vec, (i32 2)), - (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1)) ->; - -def : Pat < - (extract_subvector v8i16:$vec, (i32 0)), - (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub0_sub1)) ->; - -def : Pat < - (extract_subvector v8i16:$vec, (i32 4)), - (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub2_sub3)) ->; - -def : Pat < - (extract_subvector v8f16:$vec, (i32 0)), - (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub0_sub1)) ->; - -def : Pat < - (extract_subvector v8f16:$vec, (i32 4)), - (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3)) ->; - -def : Pat < - (extract_subvector v16i16:$vec, (i32 0)), - (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub0_sub1_sub2_sub3)) ->; - -def : Pat < - (extract_subvector v16i16:$vec, (i32 8)), - (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub4_sub5_sub6_sub7)) ->; - -def : Pat < - (extract_subvector v16f16:$vec, (i32 0)), - (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub0_sub1_sub2_sub3)) ->; - -def : Pat < - (extract_subvector v16f16:$vec, (i32 8)), - (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub4_sub5_sub6_sub7)) ->; - foreach Index = 0-31 in { def Extract_Element_v32i32_#Index : Extract_Element < i32, v32i32, Index, !cast<SubRegIndex>(sub#Index) @@ -2002,13 +1988,13 @@ def : GCNPat < def : GCNPat < (i32 (sext i1:$src0)), (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), - /*src1mod*/(i32 0), /*src1*/(i32 -1), $src0) + /*src1mod*/(i32 0), 
/*src1*/(i32 -1), i1:$src0) >; class Ext32Pat <SDNode ext> : GCNPat < (i32 (ext i1:$src0)), (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), - /*src1mod*/(i32 0), /*src1*/(i32 1), $src0) + /*src1mod*/(i32 0), /*src1*/(i32 1), i1:$src0) >; def : Ext32Pat <zext>; @@ -2043,48 +2029,53 @@ def BFIImm32 : PatFrag< }] >; + // Definition from ISA doc: // (y & x) | (z & ~x) -def : AMDGPUPat < +def : AMDGPUPatIgnoreCopies < (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), - (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) + (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32)) >; // (y & C) | (z & ~C) -def : AMDGPUPat < +def : AMDGPUPatIgnoreCopies < (BFIImm32 i32:$x, i32:$y, i32:$z), (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) >; // 64-bit version -def : AMDGPUPat < +def : AMDGPUPatIgnoreCopies < (DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))), (REG_SEQUENCE VReg_64, (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), - (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), - (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, + (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), - (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), - (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) + (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) >; // SHA-256 Ch function // z ^ (x & (y ^ z)) -def : AMDGPUPat < +def : AMDGPUPatIgnoreCopies < (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), - (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) + (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32)) >; // 64-bit version -def : AMDGPUPat < +def : AMDGPUPatIgnoreCopies < (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor 
i64:$y, i64:$z))), (REG_SEQUENCE VReg_64, (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), - (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), - (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, + (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), - (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), - (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) + (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) >; def : AMDGPUPat < @@ -3185,24 +3176,27 @@ def : AMDGPUPat < // SHA-256 Ma patterns // ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y -def : AMDGPUPat < +def : AMDGPUPatIgnoreCopies < (DivergentBinFrag<or> (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), - (V_BFI_B32_e64 (V_XOR_B32_e64 VSrc_b32:$x, VSrc_b32:$y), VSrc_b32:$z, VSrc_b32:$y) + (V_BFI_B32_e64 (V_XOR_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)), + (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32), + (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)) >; -def : AMDGPUPat < +def : AMDGPUPatIgnoreCopies < (DivergentBinFrag<or> (and i64:$x, i64:$z), (and i64:$y, (or i64:$x, i64:$z))), (REG_SEQUENCE VReg_64, (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), - (i32 (EXTRACT_SUBREG VReg_64:$z, sub0)), - (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), sub0, + (i32 (EXTRACT_SUBREG VReg_64:$z, sub0)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), sub0, (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), - (i32 (EXTRACT_SUBREG VReg_64:$z, sub1)), - (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), sub1) + (i32 (EXTRACT_SUBREG VReg_64:$z, sub1)), + (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), sub1) >; multiclass IntMed3Pat<Instruction med3Inst, @@ -3486,8 +3480,6 @@ def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction { } let Namespace = "AMDGPU" in { -def 
G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP; -def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP; def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP; def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP; } @@ -3614,15 +3606,6 @@ def G_FPTRUNC_ROUND_DOWNWARD : AMDGPUGenericInstruction { // Dummy Instructions //============================================================================// -def V_ILLEGAL_gfx6_gfx7_gfx8_gfx9 : Enc32, InstSI<(outs), (ins), "v_illegal"> { - let Inst{31-0} = 0xFFFFFFFF; - let FixedSize = 1; - let Size = 4; - let Uses = [EXEC]; - let hasSideEffects = 1; - let SubtargetPredicate = isGFX6GFX7GFX8GFX9; -} - def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> { let Inst{31-0} = 0x00000000; let FixedSize = 1; diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 2b5ca33b0e4f..c252d30e250e 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -331,7 +331,6 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { switch (Opc) { case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_LOAD_DWORD_IMM: case AMDGPU::GLOBAL_LOAD_DWORD: @@ -342,7 +341,6 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::FLAT_STORE_DWORD: return 1; case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM: case AMDGPU::GLOBAL_LOAD_DWORDX2: @@ -360,7 +358,6 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::FLAT_STORE_DWORDX3: return 3; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::GLOBAL_LOAD_DWORDX4: @@ -371,7 +368,6 @@ static unsigned 
getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::FLAT_STORE_DWORDX4: return 4; case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: - case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: return 8; @@ -432,6 +428,10 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: + case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN: + case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact: + case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN: + case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact: return TBUFFER_LOAD; case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: @@ -446,12 +446,6 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return S_BUFFER_LOAD_IMM; - // For the purposes of this optimization SGPR variants of buffer loads - // are considered to be zero-offsetted SGPR_IMM loads. - case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: @@ -533,12 +527,6 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; - // For the purposes of this optimization SGPR variants of buffer loads - // are considered to be zero-offsetted SGPR_IMM loads. 
- case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: @@ -641,10 +629,6 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { switch (Opc) { default: return Result; - case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR: - case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR: case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: @@ -740,7 +724,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, Offset = 0; } else { int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); - Offset = OffsetIdx == -1 ? 0 : I->getOperand(OffsetIdx).getImm(); + Offset = I->getOperand(OffsetIdx).getImm(); } if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) @@ -887,7 +871,7 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, unsigned MaxMask = std::max(CI.DMask, Paired.DMask); unsigned MinMask = std::min(CI.DMask, Paired.DMask); - unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); + unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask); if ((1u << AllowedBitsForMin) <= MinMask) return false; @@ -926,7 +910,7 @@ static unsigned getBufferFormatWithCompCount(unsigned OldFormat, // - if Lo == 0, return 0 (even though the "- 1" below underflows // - if Lo > Hi, return 0 (as if the range wrapped around) static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) { - return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1); + return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1); } bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, @@ -975,9 +959,12 
@@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, // Handle all non-DS instructions. if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { - return (EltOffset0 + CI.Width == EltOffset1 || - EltOffset1 + Paired.Width == EltOffset0) && - CI.CPol == Paired.CPol; + if (EltOffset0 + CI.Width != EltOffset1 && + EltOffset1 + Paired.Width != EltOffset0) + return false; + if (CI.CPol != Paired.CPol) + return false; + return true; } // If the offset in elements doesn't fit in 8-bits, we might be able to use @@ -1383,10 +1370,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)); if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)); - // For convenience, when SGPR_IMM buffer loads are merged into a - // zero-offset load, we generate its SGPR variant. - if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::offset)) - New.addImm(MergedOffset); + New.addImm(MergedOffset); New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired)); std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); @@ -1697,14 +1681,11 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, default: return 0; case 2: - return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR - : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; + return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM; case 4: - return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR - : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; + return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM; case 8: - return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR - : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; + return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; } case S_LOAD_IMM: switch (Width) { @@ -2092,7 +2073,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( // Step1: Find the base-registers and a 64bit constant offset. 
MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); MemAddress MAddr; - if (Visited.find(&MI) == Visited.end()) { + if (!Visited.contains(&MI)) { processBaseWithConstOffset(Base, MAddr); Visited[&MI] = MAddr; } else @@ -2155,7 +2136,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( const MachineOperand &BaseNext = *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); MemAddress MAddrNext; - if (Visited.find(&MINext) == Visited.end()) { + if (!Visited.contains(&MINext)) { processBaseWithConstOffset(BaseNext, MAddrNext); Visited[&MINext] = MAddrNext; } else diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 67077a2eaa6b..00cb5b2878f4 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -427,6 +427,8 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) { BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec) .addReg(Exec) .add(MI.getOperand(0)); + if (LV) + LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *AndN2); auto BranchPt = skipToUncondBrOrEnd(MBB, MI.getIterator()); MachineInstr *Branch = @@ -514,13 +516,18 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) { LV->replaceKillInstruction(DataReg, MI, *NewMI); if (SplitBB != &MBB) { - // Track the set of registers defined in the split block so we don't - // accidentally add the original block to AliveBlocks. - DenseSet<Register> SplitDefs; - for (MachineInstr &X : *SplitBB) { - for (MachineOperand &Op : X.operands()) { - if (Op.isReg() && Op.isDef() && Op.getReg().isVirtual()) - SplitDefs.insert(Op.getReg()); + // Track the set of registers defined in the original block so we don't + // accidentally add the original block to AliveBlocks. AliveBlocks only + // includes blocks which are live through, which excludes live outs and + // local defs. 
+ DenseSet<Register> DefInOrigBlock; + + for (MachineBasicBlock *BlockPiece : {&MBB, SplitBB}) { + for (MachineInstr &X : *BlockPiece) { + for (MachineOperand &Op : X.all_defs()) { + if (Op.getReg().isVirtual()) + DefInOrigBlock.insert(Op.getReg()); + } } } @@ -532,7 +539,7 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) { VI.AliveBlocks.set(SplitBB->getNumber()); else { for (MachineInstr *Kill : VI.Kills) { - if (Kill->getParent() == SplitBB && !SplitDefs.contains(Reg)) + if (Kill->getParent() == SplitBB && !DefInOrigBlock.contains(Reg)) VI.AliveBlocks.set(MBB.getNumber()); } } diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 3450a9f0681f..47d28d5d0eab 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -50,7 +50,9 @@ public: SILowerSGPRSpills() : MachineFunctionPass(ID) {} void calculateSaveRestoreBlocks(MachineFunction &MF); - bool spillCalleeSavedRegs(MachineFunction &MF); + bool spillCalleeSavedRegs(MachineFunction &MF, + SmallVectorImpl<int> &CalleeSavedFIs); + void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS); bool runOnMachineFunction(MachineFunction &MF) override; @@ -58,6 +60,13 @@ public: AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } + + MachineFunctionProperties getClearedProperties() const override { + // SILowerSGPRSpills introduces new Virtual VGPRs for spilling SGPRs. 
+ return MachineFunctionProperties() + .set(MachineFunctionProperties::Property::IsSSA) + .set(MachineFunctionProperties::Property::NoVRegs); + } }; } // end anonymous namespace @@ -197,7 +206,8 @@ static void updateLiveness(MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI) { EntryBB.sortUniqueLiveIns(); } -bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { +bool SILowerSGPRSpills::spillCalleeSavedRegs( + MachineFunction &MF, SmallVectorImpl<int> &CalleeSavedFIs) { MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -228,6 +238,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { TRI->getSpillAlign(*RC), true); CSI.push_back(CalleeSavedInfo(Reg, JunkFI)); + CalleeSavedFIs.push_back(JunkFI); } } @@ -248,6 +259,50 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { return false; } +void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF, + LiveIntervals *LIS) { + // TODO: This is a workaround to avoid the unmodelled liveness computed with + // whole-wave virtual registers when allocated together with the regular VGPR + // virtual registers. Presently, the liveness computed during the regalloc is + // only uniform (or single lane aware) and it doesn't take account of the + // divergent control flow that exists for our GPUs. Since the WWM registers + // can modify inactive lanes, the wave-aware liveness should be computed for + // the virtual registers to accurately plot their interferences. Without + // having the divergent CFG for the function, it is difficult to implement the + // wave-aware liveness info. Until then, we conservatively extend the liveness + // of the wwm registers into the entire function so that they won't be reused + // without first spilling/splitting their liveranges. 
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // Insert the IMPLICIT_DEF for the wwm-registers in the entry blocks. + for (auto Reg : MFI->getSGPRSpillVGPRs()) { + for (MachineBasicBlock *SaveBlock : SaveBlocks) { + MachineBasicBlock::iterator InsertBefore = SaveBlock->begin(); + auto MIB = BuildMI(*SaveBlock, *InsertBefore, InsertBefore->getDebugLoc(), + TII->get(AMDGPU::IMPLICIT_DEF), Reg); + MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG); + if (LIS) { + LIS->InsertMachineInstrInMaps(*MIB); + } + } + } + + // Insert the KILL in the return blocks to extend their liveness until the + // end of function. Insert a separate KILL for each VGPR. + for (MachineBasicBlock *RestoreBlock : RestoreBlocks) { + MachineBasicBlock::iterator InsertBefore = + RestoreBlock->getFirstTerminator(); + for (auto Reg : MFI->getSGPRSpillVGPRs()) { + auto MIB = + BuildMI(*RestoreBlock, *InsertBefore, InsertBefore->getDebugLoc(), + TII->get(TargetOpcode::KILL)); + MIB.addReg(Reg); + if (LIS) + LIS->InsertMachineInstrInMaps(*MIB); + } + } +} + bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); @@ -261,7 +316,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // First, expose any CSR SGPR spills. This is mostly the same as what PEI // does, but somewhat simpler. calculateSaveRestoreBlocks(MF); - bool HasCSRs = spillCalleeSavedRegs(MF); + SmallVector<int> CalleeSavedFIs; + bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -275,6 +331,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { bool MadeChange = false; bool NewReservedRegs = false; + bool SpilledToVirtVGPRLanes = false; // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be // handled as SpilledToReg in regular PrologEpilogInserter. 
@@ -297,23 +354,53 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); - if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { - NewReservedRegs = true; - bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( - MI, FI, nullptr, Indexes, LIS); - (void)Spilled; - assert(Spilled && "failed to spill SGPR to VGPR when allocated"); - SpillFIs.set(FI); + + bool IsCalleeSaveSGPRSpill = + std::find(CalleeSavedFIs.begin(), CalleeSavedFIs.end(), FI) != + CalleeSavedFIs.end(); + if (IsCalleeSaveSGPRSpill) { + // Spill callee-saved SGPRs into physical VGPR lanes. + + // TODO: This is to ensure the CFIs are static for efficient frame + // unwinding in the debugger. Spilling them into virtual VGPR lanes + // involves regalloc to allocate the physical VGPRs and that might + // cause intermediate spill/split of such liveranges for successful + // allocation. This would result in broken CFI encoding unless the + // regalloc aware CFI generation to insert new CFIs along with the + // intermediate spills is implemented. No such support + // currently exists in the LLVM compiler. + if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) { + NewReservedRegs = true; + bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( + MI, FI, nullptr, Indexes, LIS, true); + if (!Spilled) + llvm_unreachable( + "failed to spill SGPR to physical VGPR lane when allocated"); + } + } else { + if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { + bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( + MI, FI, nullptr, Indexes, LIS); + if (!Spilled) + llvm_unreachable( + "failed to spill SGPR to virtual VGPR lane when allocated"); + SpillFIs.set(FI); + SpilledToVirtVGPRLanes = true; + } } } } - // FIXME: Adding to live-ins redundant with reserving registers. 
- for (MachineBasicBlock &MBB : MF) { - for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) - MBB.addLiveIn(Reg); - MBB.sortUniqueLiveIns(); + if (SpilledToVirtVGPRLanes) { + extendWWMVirtRegLiveness(MF, LIS); + if (LIS) { + // Compute the LiveInterval for the newly created virtual registers. + for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) + LIS->createAndComputeVirtRegInterval(Reg); + } + } + for (MachineBasicBlock &MBB : MF) { // FIXME: The dead frame indices are replaced with a null register from // the debug value instructions. We should instead, update it with the // correct register value. But not sure the register value alone is @@ -337,12 +424,30 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { MadeChange = true; } + if (SpilledToVirtVGPRLanes) { + const TargetRegisterClass *RC = TRI->getWaveMaskRegClass(); + // Shift back the reserved SGPR for EXEC copy into the lowest range. + // This SGPR is reserved to handle the whole-wave spill/copy operations + // that might get inserted during vgpr regalloc. + Register UnusedLowSGPR = TRI->findUnusedRegister(MRI, RC, MF); + if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) < + TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy())) + FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR); + } else { + // No SGPR spills to virtual VGPR lanes and hence there won't be any WWM + // spills/copies. Reset the SGPR reserved for EXEC copy. + FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister); + } + SaveBlocks.clear(); RestoreBlocks.clear(); - // Updated the reserved registers with any VGPRs added for SGPR spills. - if (NewReservedRegs) - MRI.freezeReservedRegs(MF); + // Updated the reserved registers with any physical VGPRs added for SGPR + // spills. 
+ if (NewReservedRegs) { + for (Register Reg : FuncInfo->getWWMReservedRegs()) + MRI.reserveReg(Reg, TRI); + } return MadeChange; } diff --git a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp new file mode 100644 index 000000000000..9c3cd1bbd6b0 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp @@ -0,0 +1,141 @@ +//===-- SILowerWWMCopies.cpp - Lower Copies after regalloc ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Lowering the WWM_COPY instructions for various register classes. +/// AMDGPU target generates WWM_COPY instruction to differentiate WWM +/// copy from COPY. This pass generates the necessary exec mask manipulation +/// instructions to replicate 'Whole Wave Mode' and lowers WWM_COPY back to +/// COPY. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-lower-wwm-copies" + +namespace { + +class SILowerWWMCopies : public MachineFunctionPass { +public: + static char ID; + + SILowerWWMCopies() : MachineFunctionPass(ID) { + initializeSILowerWWMCopiesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "SI Lower WWM Copies"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + bool isSCCLiveAtMI(const MachineInstr &MI); + void addToWWMSpills(MachineFunction &MF, Register Reg); + + LiveIntervals *LIS; + SlotIndexes *Indexes; + VirtRegMap *VRM; + const SIRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + SIMachineFunctionInfo *MFI; +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies", + false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_END(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies", false, + false) + +char SILowerWWMCopies::ID = 0; + +char &llvm::SILowerWWMCopiesID = SILowerWWMCopies::ID; + +bool SILowerWWMCopies::isSCCLiveAtMI(const MachineInstr &MI) { + // We can't determine the liveness info if LIS isn't available. Early return + // in that case and always assume SCC is live. 
+ if (!LIS) + return true; + + LiveRange &LR = + LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); + SlotIndex Idx = LIS->getInstructionIndex(MI); + return LR.liveAt(Idx); +} + +// If \p Reg is assigned with a physical VGPR, add the latter into wwm-spills +// for preserving its entire lanes at function prolog/epilog. +void SILowerWWMCopies::addToWWMSpills(MachineFunction &MF, Register Reg) { + if (Reg.isPhysical()) + return; + + Register PhysReg = VRM->getPhys(Reg); + assert(PhysReg != VirtRegMap::NO_PHYS_REG && + "should have allocated a physical register"); + + MFI->allocateWWMSpill(MF, PhysReg); +} + +bool SILowerWWMCopies::runOnMachineFunction(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + MFI = MF.getInfo<SIMachineFunctionInfo>(); + LIS = getAnalysisIfAvailable<LiveIntervals>(); + Indexes = getAnalysisIfAvailable<SlotIndexes>(); + VRM = getAnalysisIfAvailable<VirtRegMap>(); + TRI = ST.getRegisterInfo(); + MRI = &MF.getRegInfo(); + + if (!MFI->hasVRegFlags()) + return false; + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() != AMDGPU::WWM_COPY) + continue; + + // TODO: Club adjacent WWM ops between same exec save/restore + assert(TII->isVGPRCopy(MI)); + + // For WWM vector copies, manipulate the exec mask around the copy + // instruction. 
+ const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock::iterator InsertPt = MI.getIterator(); + Register RegForExecCopy = MFI->getSGPRForEXECCopy(); + TII->insertScratchExecCopy(MF, MBB, InsertPt, DL, RegForExecCopy, + isSCCLiveAtMI(MI), Indexes); + TII->restoreExec(MF, MBB, ++InsertPt, DL, RegForExecCopy, Indexes); + addToWWMSpills(MF, MI.getOperand(0).getReg()); + LLVM_DEBUG(dbgs() << "WWM copy manipulation for " << MI); + + // Lower WWM_COPY back to COPY + MI.setDesc(TII->get(AMDGPU::COPY)); + Changed |= true; + } + } + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index b2a433dd3db9..219464eac9ec 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -65,6 +65,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, Occupancy = ST.computeOccupancy(F, getLDSSize()); CallingConv::ID CC = F.getCallingConv(); + VRegFlags.reserve(1024); + // FIXME: Should have analysis or something rather than attribute to detect // calls. 
const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); @@ -119,7 +121,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, else if (ST.isMesaGfxShader(F)) ImplicitBufferPtr = true; - if (!AMDGPU::isGraphics(CC)) { + if (!AMDGPU::isGraphics(CC) || + (CC == CallingConv::AMDGPU_CS && ST.hasArchitectedSGPRs())) { if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x")) WorkGroupIDX = true; @@ -128,7 +131,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z")) WorkGroupIDZ = true; + } + if (!AMDGPU::isGraphics(CC)) { if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x")) WorkItemIDX = true; @@ -309,37 +314,23 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs, return false; } -bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF, - int FI, - unsigned LaneIndex) { - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); +bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills( + MachineFunction &MF, int FI, unsigned LaneIndex) { MachineRegisterInfo &MRI = MF.getRegInfo(); Register LaneVGPR; if (!LaneIndex) { - LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); - if (LaneVGPR == AMDGPU::NoRegister) { - // We have no VGPRs left for spilling SGPRs. Reset because we will not - // partially spill the SGPR to VGPRs. - SGPRSpillToVGPRLanes.erase(FI); - return false; - } - + LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); SpillVGPRs.push_back(LaneVGPR); - // Add this register as live-in to all blocks to avoid machine verifier - // complaining about use of an undefined physical register. 
- for (MachineBasicBlock &BB : MF) - BB.addLiveIn(LaneVGPR); } else { LaneVGPR = SpillVGPRs.back(); } - SGPRSpillToVGPRLanes[FI].push_back( + SGPRSpillsToVirtualVGPRLanes[FI].push_back( SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); return true; } -bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills( +bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills( MachineFunction &MF, int FI, unsigned LaneIndex) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -350,16 +341,21 @@ bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills( if (LaneVGPR == AMDGPU::NoRegister) { // We have no VGPRs left for spilling SGPRs. Reset because we will not // partially spill the SGPR to VGPRs. - PrologEpilogSGPRSpillToVGPRLanes.erase(FI); + SGPRSpillsToPhysicalVGPRLanes.erase(FI); return false; } allocateWWMSpill(MF, LaneVGPR); + reserveWWMRegister(LaneVGPR); + for (MachineBasicBlock &MBB : MF) { + MBB.addLiveIn(LaneVGPR); + MBB.sortUniqueLiveIns(); + } } else { - LaneVGPR = WWMSpills.back().first; + LaneVGPR = WWMReservedRegs.back(); } - PrologEpilogSGPRSpillToVGPRLanes[FI].push_back( + SGPRSpillsToPhysicalVGPRLanes[FI].push_back( SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); return true; } @@ -368,8 +364,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI, bool IsPrologEpilog) { std::vector<SIRegisterInfo::SpilledReg> &SpillLanes = - IsPrologEpilog ? PrologEpilogSGPRSpillToVGPRLanes[FI] - : SGPRSpillToVGPRLanes[FI]; + IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI] + : SGPRSpillsToVirtualVGPRLanes[FI]; // This has already been allocated. if (!SpillLanes.empty()) @@ -390,15 +386,14 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF, "not spilling SGPRs to VGPRs"); unsigned &NumSpillLanes = - IsPrologEpilog ? NumVGPRPrologEpilogSpillLanes : NumVGPRSpillLanes; + IsPrologEpilog ? 
NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes; for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) { unsigned LaneIndex = (NumSpillLanes % WaveSize); - bool Allocated = - IsPrologEpilog - ? allocateVGPRForPrologEpilogSGPRSpills(MF, FI, LaneIndex) - : allocateVGPRForSGPRSpills(MF, FI, LaneIndex); + bool Allocated = IsPrologEpilog + ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex) + : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex); if (!Allocated) { NumSpillLanes -= I; return false; @@ -479,16 +474,25 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF, bool SIMachineFunctionInfo::removeDeadFrameIndices( MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) { - // Remove dead frame indices from function frame. And also make sure to remove - // the frame indices from `SGPRSpillToVGPRLanes` data structure, otherwise, it - // could result in an unexpected side effect and bug, in case of any - // re-mapping of freed frame indices by later pass(es) like "stack slot + // Remove dead frame indices from function frame, however keep FP & BP since + // spills for them haven't been inserted yet. And also make sure to remove the + // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure, + // otherwise, it could result in an unexpected side effect and bug, in case of + // any re-mapping of freed frame indices by later pass(es) like "stack slot // coloring". - for (auto &R : make_early_inc_range(SGPRSpillToVGPRLanes)) { + for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) { MFI.RemoveStackObject(R.first); - SGPRSpillToVGPRLanes.erase(R.first); + SGPRSpillsToVirtualVGPRLanes.erase(R.first); } + // Remove the dead frame indices of CSR SGPRs which are spilled to physical + // VGPR lanes during SILowerSGPRSpills pass. 
+ if (!ResetSGPRSpillStackIDs) { + for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) { + MFI.RemoveStackObject(R.first); + SGPRSpillsToPhysicalVGPRLanes.erase(R.first); + } + } bool HaveSGPRToMemory = false; if (ResetSGPRSpillStackIDs) { @@ -537,6 +541,16 @@ MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const { return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; } +void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) { + VRegFlags.grow(Reg); +} + +void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg, + Register SrcReg) { + VRegFlags.grow(NewReg); + VRegFlags[NewReg] = VRegFlags[SrcReg]; +} + Register SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -638,12 +652,21 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)), BytesInStackArgArea(MFI.getBytesInStackArgArea()), ReturnsVoid(MFI.returnsVoid()), - ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), Mode(MFI.getMode()) { + ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), + PSInputAddr(MFI.getPSInputAddr()), + PSInputEnable(MFI.getPSInputEnable()), + Mode(MFI.getMode()) { for (Register Reg : MFI.getWWMReservedRegs()) WWMReservedRegs.push_back(regToString(Reg, TRI)); + if (MFI.getLongBranchReservedReg()) + LongBranchReservedReg = regToString(MFI.getLongBranchReservedReg(), TRI); if (MFI.getVGPRForAGPRCopy()) VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI); + + if (MFI.getSGPRForEXECCopy()) + SGPRForEXECCopy = regToString(MFI.getSGPRForEXECCopy(), TRI); + auto SFI = MFI.getOptionalScavengeFI(); if (SFI) ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo()); @@ -661,6 +684,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields( LDSSize = YamlMFI.LDSSize; GDSSize = YamlMFI.GDSSize; DynLDSAlign = YamlMFI.DynLDSAlign; + PSInputAddr = YamlMFI.PSInputAddr; + PSInputEnable = 
YamlMFI.PSInputEnable; HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress; Occupancy = YamlMFI.Occupancy; IsEntryFunction = YamlMFI.IsEntryFunction; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index c869ee875711..37572d30dff6 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -18,6 +18,7 @@ #include "AMDGPUTargetMachine.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" +#include "SIModeRegisterDefaults.h" #include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/PseudoSourceValue.h" @@ -215,7 +216,7 @@ struct SIMode { SIMode() = default; - SIMode(const AMDGPU::SIModeRegisterDefaults &Mode) { + SIMode(const SIModeRegisterDefaults &Mode) { IEEE = Mode.IEEE; DX10Clamp = Mode.DX10Clamp; FP32InputDenormals = Mode.FP32Denormals.Input != DenormalMode::PreserveSign; @@ -275,9 +276,15 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { bool ReturnsVoid = true; std::optional<SIArgumentInfo> ArgInfo; + + unsigned PSInputAddr = 0; + unsigned PSInputEnable = 0; + SIMode Mode; std::optional<FrameIndex> ScavengeFI; StringValue VGPRForAGPRCopy; + StringValue SGPRForEXECCopy; + StringValue LongBranchReservedReg; SIMachineFunctionInfo() = default; SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &, @@ -311,6 +318,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("bytesInStackArgArea", MFI.BytesInStackArgArea, 0u); YamlIO.mapOptional("returnsVoid", MFI.ReturnsVoid, true); YamlIO.mapOptional("argumentInfo", MFI.ArgInfo); + YamlIO.mapOptional("psInputAddr", MFI.PSInputAddr, 0u); + YamlIO.mapOptional("psInputEnable", MFI.PSInputEnable, 0u); YamlIO.mapOptional("mode", MFI.Mode, SIMode()); YamlIO.mapOptional("highBitsOf32BitAddress", MFI.HighBitsOf32BitAddress, 0u); @@ -319,6 +328,10 @@ template <> struct 
MappingTraits<SIMachineFunctionInfo> { YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI); YamlIO.mapOptional("vgprForAGPRCopy", MFI.VGPRForAGPRCopy, StringValue()); // Don't print out when it's empty. + YamlIO.mapOptional("sgprForEXECCopy", MFI.SGPRForEXECCopy, + StringValue()); // Don't print out when it's empty. + YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg, + StringValue()); } }; @@ -355,11 +368,12 @@ public: /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. -class SIMachineFunctionInfo final : public AMDGPUMachineFunction { +class SIMachineFunctionInfo final : public AMDGPUMachineFunction, + private MachineRegisterInfo::Delegate { friend class GCNTargetMachine; // State of MODE register, assumed FP mode. - AMDGPU::SIModeRegisterDefaults Mode; + SIModeRegisterDefaults Mode; // Registers that may be reserved for spilling purposes. These may be the same // as the input registers. @@ -374,6 +388,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { // base to the beginning of the new function's frame. Register StackPtrOffsetReg = AMDGPU::SP_REG; + // Registers that may be reserved when RA doesn't allocate enough + // registers to plan for the case where an indirect branch ends up + // being needed during branch relaxation. + Register LongBranchReservedReg; + AMDGPUFunctionArgInfo ArgInfo; // Graphics info. @@ -453,6 +472,9 @@ private: unsigned HighBitsOf32BitAddress; + // Flags associated with the virtual registers. + IndexedMap<uint8_t, VirtReg2IndexFunctor> VRegFlags; + // Current recorded maximum possible occupancy. unsigned Occupancy; @@ -462,6 +484,10 @@ private: MCPhysReg getNextSystemSGPR() const; + // MachineRegisterInfo callback functions to notify events. 
+ void MRI_NoteNewVirtualRegister(Register Reg) override; + void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override; + public: struct VGPRSpillToAGPR { SmallVector<MCPhysReg, 32> Lanes; @@ -470,15 +496,16 @@ public: }; private: - // To track VGPR + lane index for each subregister of the SGPR spilled to - // frameindex key during SILowerSGPRSpills pass. - DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> SGPRSpillToVGPRLanes; - // To track VGPR + lane index for spilling special SGPRs like Frame Pointer - // identified during PrologEpilogInserter. + // To track virtual VGPR + lane index for each subregister of the SGPR spilled + // to frameindex key during SILowerSGPRSpills pass. + DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> + SGPRSpillsToVirtualVGPRLanes; + // To track physical VGPR + lane index for CSR SGPR spills and special SGPRs + // like Frame Pointer identified during PrologEpilogInserter. DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> - PrologEpilogSGPRSpillToVGPRLanes; - unsigned NumVGPRSpillLanes = 0; - unsigned NumVGPRPrologEpilogSpillLanes = 0; + SGPRSpillsToPhysicalVGPRLanes; + unsigned NumVirtualVGPRSpillLanes = 0; + unsigned NumPhysicalVGPRSpillLanes = 0; SmallVector<Register, 2> SpillVGPRs; using WWMSpillsMap = MapVector<Register, int>; // To track the registers used in instructions that can potentially modify the @@ -504,6 +531,9 @@ private: // PrologEpilogInserter. PrologEpilogSGPRSpillsMap PrologEpilogSGPRSpills; + // To save/restore EXEC MASK around WWM spills and copies. + Register SGPRForEXECCopy; + DenseMap<int, VGPRSpillToAGPR> VGPRToAGPRSpills; // AGPRs used for VGPR spills. 
@@ -519,10 +549,10 @@ private: private: Register VGPRForAGPRCopy; - bool allocateVGPRForSGPRSpills(MachineFunction &MF, int FI, - unsigned LaneIndex); - bool allocateVGPRForPrologEpilogSGPRSpills(MachineFunction &MF, int FI, - unsigned LaneIndex); + bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI, + unsigned LaneIndex); + bool allocatePhysicalVGPRForSGPRSpills(MachineFunction &MF, int FI, + unsigned LaneIndex); public: Register getVGPRForAGPRCopy() const { @@ -551,14 +581,12 @@ public: void reserveWWMRegister(Register Reg) { WWMReservedRegs.insert(Reg); } - AMDGPU::SIModeRegisterDefaults getMode() const { - return Mode; - } + SIModeRegisterDefaults getMode() const { return Mode; } ArrayRef<SIRegisterInfo::SpilledReg> - getSGPRSpillToVGPRLanes(int FrameIndex) const { - auto I = SGPRSpillToVGPRLanes.find(FrameIndex); - return (I == SGPRSpillToVGPRLanes.end()) + getSGPRSpillToVirtualVGPRLanes(int FrameIndex) const { + auto I = SGPRSpillsToVirtualVGPRLanes.find(FrameIndex); + return (I == SGPRSpillsToVirtualVGPRLanes.end()) ? ArrayRef<SIRegisterInfo::SpilledReg>() : ArrayRef(I->second); } @@ -579,7 +607,7 @@ public: // Check if an entry created for \p Reg in PrologEpilogSGPRSpills. Return true // on success and false otherwise. bool hasPrologEpilogSGPRSpillEntry(Register Reg) const { - return PrologEpilogSGPRSpills.find(Reg) != PrologEpilogSGPRSpills.end(); + return PrologEpilogSGPRSpills.contains(Reg); } // Get the scratch SGPR if allocated to save/restore \p Reg. @@ -620,13 +648,28 @@ public: } ArrayRef<SIRegisterInfo::SpilledReg> - getPrologEpilogSGPRSpillToVGPRLanes(int FrameIndex) const { - auto I = PrologEpilogSGPRSpillToVGPRLanes.find(FrameIndex); - return (I == PrologEpilogSGPRSpillToVGPRLanes.end()) + getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const { + auto I = SGPRSpillsToPhysicalVGPRLanes.find(FrameIndex); + return (I == SGPRSpillsToPhysicalVGPRLanes.end()) ? 
ArrayRef<SIRegisterInfo::SpilledReg>() : ArrayRef(I->second); } + void setFlag(Register Reg, uint8_t Flag) { + assert(Reg.isVirtual()); + if (VRegFlags.inBounds(Reg)) + VRegFlags[Reg] |= Flag; + } + + bool checkFlag(Register Reg, uint8_t Flag) const { + if (Reg.isPhysical()) + return false; + + return VRegFlags.inBounds(Reg) && VRegFlags[Reg] & Flag; + } + + bool hasVRegFlags() { return VRegFlags.size(); } + void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4, Align Alignment = Align(4)); @@ -639,6 +682,10 @@ public: return SpillAGPR; } + Register getSGPRForEXECCopy() const { return SGPRForEXECCopy; } + + void setSGPRForEXECCopy(Register Reg) { SGPRForEXECCopy = Reg; } + ArrayRef<MCPhysReg> getVGPRSpillAGPRs() const { return SpillVGPR; } @@ -693,21 +740,35 @@ public: } // Add system SGPRs. - Register addWorkGroupIDX() { - ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR()); - NumSystemSGPRs += 1; + Register addWorkGroupIDX(bool HasArchitectedSGPRs) { + Register Reg = + HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP9 : getNextSystemSGPR(); + ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(Reg); + if (!HasArchitectedSGPRs) + NumSystemSGPRs += 1; + return ArgInfo.WorkGroupIDX.getRegister(); } - Register addWorkGroupIDY() { - ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR()); - NumSystemSGPRs += 1; + Register addWorkGroupIDY(bool HasArchitectedSGPRs) { + Register Reg = + HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR(); + unsigned Mask = HasArchitectedSGPRs && hasWorkGroupIDZ() ? 
0xffff : ~0u; + ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg, Mask); + if (!HasArchitectedSGPRs) + NumSystemSGPRs += 1; + return ArgInfo.WorkGroupIDY.getRegister(); } - Register addWorkGroupIDZ() { - ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR()); - NumSystemSGPRs += 1; + Register addWorkGroupIDZ(bool HasArchitectedSGPRs) { + Register Reg = + HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR(); + unsigned Mask = HasArchitectedSGPRs ? 0xffff << 16 : ~0u; + ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg, Mask); + if (!HasArchitectedSGPRs) + NumSystemSGPRs += 1; + return ArgInfo.WorkGroupIDZ.getRegister(); } @@ -872,6 +933,8 @@ public: StackPtrOffsetReg = Reg; } + void setLongBranchReservedReg(Register Reg) { LongBranchReservedReg = Reg; } + // Note the unset value for this is AMDGPU::SP_REG rather than // NoRegister. This is mostly a workaround for MIR tests where state that // can't be directly computed from the function is not preserved in serialized @@ -880,6 +943,8 @@ public: return StackPtrOffsetReg; } + Register getLongBranchReservedReg() const { return LongBranchReservedReg; } + Register getQueuePtrUserSGPR() const { return ArgInfo.QueuePtr.getRegister(); } diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index 6d901d6783f0..677f1590287e 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -1883,7 +1883,7 @@ void SIScheduleDAGMI::schedule() LLVM_DEBUG(dbgs() << "Preparing Scheduling\n"); buildDAGWithRegPressure(); - postprocessDAG(); + postProcessDAG(); LLVM_DEBUG(dump()); if (PrintDAGs) diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 5f2707317984..bc48f7b76c6d 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -22,7 +22,7 @@ #include 
"llvm/CodeGen/MachineFunctionPass.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/Support/AtomicOrdering.h" -#include "llvm/Support/TargetParser.h" +#include "llvm/TargetParser/TargetParser.h" using namespace llvm; using namespace llvm::AMDGPU; @@ -351,6 +351,10 @@ public: /// Virtual destructor to allow derivations to be deleted. virtual ~SICacheControl() = default; + virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI) const { + return false; + } }; class SIGfx6CacheControl : public SICacheControl { @@ -509,6 +513,20 @@ public: bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, Position Pos) const override; + + bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI) const override { + bool Changed = false; + if (ST.hasForceStoreSC0SC1() && + (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH | + SIAtomicAddrSpace::GLOBAL | + SIAtomicAddrSpace::OTHER)) != + SIAtomicAddrSpace::NONE) { + Changed |= enableSC0Bit(MI); + Changed |= enableSC1Bit(MI); + } + return Changed; + } }; class SIGfx10CacheControl : public SIGfx7CacheControl { @@ -2209,8 +2227,13 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, bool Changed = false; if (MOI.isAtomic()) { - if (MOI.getOrdering() == AtomicOrdering::Acquire || - MOI.getOrdering() == AtomicOrdering::Release || + if (MOI.getOrdering() == AtomicOrdering::Acquire) + Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(), + SIMemOp::LOAD | SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE); + + if (MOI.getOrdering() == AtomicOrdering::Release || MOI.getOrdering() == AtomicOrdering::AcquireRelease || MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) /// TODO: This relies on a barrier always generating a waitcnt @@ -2319,9 +2342,10 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { if (const 
auto &MOI = MOA.getLoadInfo(MI)) Changed |= expandLoad(*MOI, MI); - else if (const auto &MOI = MOA.getStoreInfo(MI)) + else if (const auto &MOI = MOA.getStoreInfo(MI)) { Changed |= expandStore(*MOI, MI); - else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) + Changed |= CC->tryForceStoreSC0SC1(*MOI, MI); + } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) Changed |= expandAtomicFence(*MOI, MI); else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI); diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp index 0d48c3159c6f..be395d53c34e 100644 --- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp @@ -29,10 +29,10 @@ using namespace llvm; struct Status { // Mask is a bitmask where a '1' indicates the corresponding Mode bit has a // known value - unsigned Mask; - unsigned Mode; + unsigned Mask = 0; + unsigned Mode = 0; - Status() : Mask(0), Mode(0){}; + Status() = default; Status(unsigned NewMask, unsigned NewMode) : Mask(NewMask), Mode(NewMode) { Mode &= Mask; @@ -96,13 +96,13 @@ public: // In Phase 1 we record the first instruction that has a mode requirement, // which is used in Phase 3 if we need to insert a mode change. - MachineInstr *FirstInsertionPoint; + MachineInstr *FirstInsertionPoint = nullptr; // A flag to indicate whether an Exit value has been set (we can't tell by // examining the Exit value itself as all values may be valid results). 
- bool ExitSet; + bool ExitSet = false; - BlockData() : FirstInsertionPoint(nullptr), ExitSet(false){}; + BlockData() = default; }; namespace { @@ -222,8 +222,8 @@ Status SIModeRegister::getInstructionMode(MachineInstr &MI, void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI, const SIInstrInfo *TII, Status InstrMode) { while (InstrMode.Mask) { - unsigned Offset = countTrailingZeros<unsigned>(InstrMode.Mask); - unsigned Width = countTrailingOnes<unsigned>(InstrMode.Mask >> Offset); + unsigned Offset = llvm::countr_zero<unsigned>(InstrMode.Mask); + unsigned Width = llvm::countr_one<unsigned>(InstrMode.Mask >> Offset); unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1); BuildMI(MBB, MI, nullptr, TII->get(AMDGPU::S_SETREG_IMM32_B32)) .addImm(Value) diff --git a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp new file mode 100644 index 000000000000..413ef5d162a7 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp @@ -0,0 +1,38 @@ +//===-- SIModeRegisterDefaults.cpp ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SIModeRegisterDefaults.h" + +using namespace llvm; + +SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) { + *this = getDefaultForCallingConv(F.getCallingConv()); + + StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString(); + if (!IEEEAttr.empty()) + IEEE = IEEEAttr == "true"; + + StringRef DX10ClampAttr = + F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString(); + if (!DX10ClampAttr.empty()) + DX10Clamp = DX10ClampAttr == "true"; + + StringRef DenormF32Attr = + F.getFnAttribute("denormal-fp-math-f32").getValueAsString(); + if (!DenormF32Attr.empty()) + FP32Denormals = parseDenormalFPAttribute(DenormF32Attr); + + StringRef DenormAttr = + F.getFnAttribute("denormal-fp-math").getValueAsString(); + if (!DenormAttr.empty()) { + DenormalMode DenormMode = parseDenormalFPAttribute(DenormAttr); + if (DenormF32Attr.empty()) + FP32Denormals = DenormMode; + FP64FP16Denormals = DenormMode; + } +} diff --git a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h new file mode 100644 index 000000000000..df2e3f9bff32 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h @@ -0,0 +1,90 @@ +//===-- SIModeRegisterDefaults.h --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIMODEREGISTERDEFAULTS_H +#define LLVM_LIB_TARGET_AMDGPU_SIMODEREGISTERDEFAULTS_H + +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/FloatingPointMode.h" + +namespace llvm { + +// Track defaults for fields in the MODE register. 
+struct SIModeRegisterDefaults { + /// Floating point opcodes that support exception flag gathering quiet and + /// propagate signaling NaN inputs per IEEE 754-2008. Min_dx10 and max_dx10 + /// become IEEE 754- 2008 compliant due to signaling NaN propagation and + /// quieting. + bool IEEE : 1; + + /// Used by the vector ALU to force DX10-style treatment of NaNs: when set, + /// clamp NaN to zero; otherwise, pass NaN through. + bool DX10Clamp : 1; + + /// If this is set, neither input or output denormals are flushed for most f32 + /// instructions. + DenormalMode FP32Denormals; + + /// If this is set, neither input or output denormals are flushed for both f64 + /// and f16/v2f16 instructions. + DenormalMode FP64FP16Denormals; + + SIModeRegisterDefaults() : + IEEE(true), + DX10Clamp(true), + FP32Denormals(DenormalMode::getIEEE()), + FP64FP16Denormals(DenormalMode::getIEEE()) {} + + SIModeRegisterDefaults(const Function &F); + + static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) { + SIModeRegisterDefaults Mode; + Mode.IEEE = !AMDGPU::isShader(CC); + return Mode; + } + + bool operator==(const SIModeRegisterDefaults Other) const { + return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp && + FP32Denormals == Other.FP32Denormals && + FP64FP16Denormals == Other.FP64FP16Denormals; + } + + /// Get the encoding value for the FP_DENORM bits of the mode register for the + /// FP32 denormal mode. + uint32_t fpDenormModeSPValue() const { + if (FP32Denormals == DenormalMode::getPreserveSign()) + return FP_DENORM_FLUSH_IN_FLUSH_OUT; + if (FP32Denormals.Output == DenormalMode::PreserveSign) + return FP_DENORM_FLUSH_OUT; + if (FP32Denormals.Input == DenormalMode::PreserveSign) + return FP_DENORM_FLUSH_IN; + return FP_DENORM_FLUSH_NONE; + } + + /// Get the encoding value for the FP_DENORM bits of the mode register for the + /// FP64/FP16 denormal mode. 
+ uint32_t fpDenormModeDPValue() const { + if (FP64FP16Denormals == DenormalMode::getPreserveSign()) + return FP_DENORM_FLUSH_IN_FLUSH_OUT; + if (FP64FP16Denormals.Output == DenormalMode::PreserveSign) + return FP_DENORM_FLUSH_OUT; + if (FP64FP16Denormals.Input == DenormalMode::PreserveSign) + return FP_DENORM_FLUSH_IN; + return FP_DENORM_FLUSH_NONE; + } + + // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should + // be able to override. + bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const { + return DX10Clamp == CalleeMode.DX10Clamp && IEEE == CalleeMode.IEEE; + } +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_SIMODEREGISTERDEFAULTS_H diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 85de3a548411..d2a5eb89da12 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -96,8 +96,8 @@ static bool isDefBetween(const SIRegisterInfo &TRI, if (Reg.isVirtual()) return isDefBetween(LIS->getInterval(Reg), AndIdx, SelIdx); - for (MCRegUnitIterator UI(Reg.asMCReg(), &TRI); UI.isValid(); ++UI) { - if (isDefBetween(LIS->getRegUnit(*UI), AndIdx, SelIdx)) + for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg())) { + if (isDefBetween(LIS->getRegUnit(Unit), AndIdx, SelIdx)) return true; } @@ -106,7 +106,7 @@ static bool isDefBetween(const SIRegisterInfo &TRI, // Optimize sequence // %sel = V_CNDMASK_B32_e64 0, 1, %cc -// %cmp = V_CMP_NE_U32 1, %1 +// %cmp = V_CMP_NE_U32 1, %sel // $vcc = S_AND_B64 $exec, %cmp // S_CBRANCH_VCC[N]Z // => @@ -218,46 +218,11 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { // and their associated liveness information. SlotIndex CmpIdx = LIS->getInstructionIndex(*Cmp); if (CCReg.isVirtual()) { - // Apply live ranges from SelLI to CCReg potentially matching splits - // and extending to loop boundaries. 
- - auto applyLiveRanges = [&](LiveRange &Dst, VNInfo *VNI) { - // Copy live ranges from SelLI, adjusting start and end as required - auto DefSegment = SelLI->FindSegmentContaining(SelIdx.getRegSlot()); - assert(DefSegment != SelLI->end() && - "No live interval segment covering definition?"); - for (auto I = DefSegment; I != SelLI->end() && I->start <= AndIdx; ++I) { - SlotIndex Start = I->start < SelIdx.getRegSlot() ? - SelIdx.getRegSlot() : I->start; - SlotIndex End = I->end < AndIdx.getRegSlot() || I->end.isBlock() ? - I->end : AndIdx.getRegSlot(); - Dst.addSegment(LiveRange::Segment(Start, End, VNI)); - } - // If SelLI does not cover AndIdx (because Cmp killed Sel) then extend. - if (!SelLI->getSegmentContaining(AndIdx.getRegSlot())) - Dst.addSegment(LiveRange::Segment(CmpIdx.getRegSlot(), AndIdx.getRegSlot(), VNI)); - }; - LiveInterval &CCLI = LIS->getInterval(CCReg); auto CCQ = CCLI.Query(SelIdx.getRegSlot()); - if (CCQ.valueIn()) - applyLiveRanges(CCLI, CCQ.valueIn()); - - if (CC->getSubReg()) { - LaneBitmask Mask = TRI->getSubRegIndexLaneMask(CC->getSubReg()); - BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator(); - CCLI.refineSubRanges( - Allocator, Mask, - [=](LiveInterval::SubRange &SR) { - auto CCQS = SR.Query(SelIdx.getRegSlot()); - if (CCQS.valueIn()) - applyLiveRanges(SR, CCQS.valueIn()); - }, - *LIS->getSlotIndexes(), *TRI); - CCLI.removeEmptySubRanges(); - - SmallVector<LiveInterval *> SplitLIs; - LIS->splitSeparateComponents(CCLI, SplitLIs); + if (CCQ.valueIn()) { + LIS->removeInterval(CCReg); + LIS->createAndComputeVirtRegInterval(CCReg); } } else LIS->removeAllRegUnitsForPhysReg(CCReg); @@ -287,7 +252,13 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) { LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot()); LIS->RemoveMachineInstrFromMaps(*Sel); + bool ShrinkSel = Sel->getOperand(0).readsReg(); Sel->eraseFromParent(); + if (ShrinkSel) { + // The result of the V_CNDMASK was a subreg def which counted as a read + 
// from the other parts of the reg. Shrink their live ranges. + LIS->shrinkToUses(SelLI); + } } } @@ -349,8 +320,8 @@ bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock &MBB) { // Instead just check that the def segments are adjacent. SlotIndex StartIdx = LIS->getInstructionIndex(SaveExecMI); SlotIndex EndIdx = LIS->getInstructionIndex(*AndExecMI); - for (MCRegUnitIterator UI(ExecReg, TRI); UI.isValid(); ++UI) { - LiveRange &RegUnit = LIS->getRegUnit(*UI); + for (MCRegUnit Unit : TRI->regunits(ExecReg)) { + LiveRange &RegUnit = LIS->getRegUnit(Unit); if (RegUnit.find(StartIdx) != std::prev(RegUnit.find(EndIdx))) return false; } diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp index ae2c10116de8..e95abae88d7a 100644 --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -357,8 +357,8 @@ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters( for (auto *I : Instructions) { auto &MI = *I; - for (auto &MO : MI.operands()) { - if (!MO.isReg() || !MO.getReg() || MO.isDef()) + for (auto &MO : MI.all_uses()) { + if (!MO.getReg()) continue; Register MOReg = MO.getReg(); @@ -522,8 +522,15 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange( auto *UseBlock = UseMI->getParent(); // Replace uses in Endif block if (UseBlock == Endif) { - assert(UseMI->isPHI() && "Uses should be PHI in Endif block"); - O.setReg(NewReg); + if (UseMI->isPHI()) { + O.setReg(NewReg); + } else { + // DetectDeadLanes may mark register uses as undef without removing + // them, in which case a non-phi instruction using the original register + // may exist in the Endif block even though the register is not live + // into it. 
+ assert(!O.readsReg()); + } continue; } diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index c21ff06454da..97b3161c7f98 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -759,7 +759,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { break; SdwaSel DstSel = static_cast<SdwaSel>( - TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));; + TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel)); SdwaSel OtherDstSel = static_cast<SdwaSel>( TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel)); @@ -1158,7 +1158,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) continue; - unsigned I = MI.getOperandNo(&Op); + unsigned I = Op.getOperandNo(); if (Desc.operands()[I].RegClass == -1 || !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass))) continue; diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp index 8553a0ab2a68..8464cb3d6fc4 100644 --- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp +++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp @@ -101,8 +101,8 @@ void SIPostRABundler::collectUsedRegUnits(const MachineInstr &MI, assert(!Op.getSubReg() && "subregister indexes should not be present after RA"); - for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) - UsedRegUnits.set(*Units); + for (MCRegUnit Unit : TRI->regunits(Reg)) + UsedRegUnits.set(Unit); } } diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp index 877c8b81b2c0..b6839c8308d8 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp @@ -54,3 +54,23 @@ uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC) const { } return Reg; } + +uint64_t SIProgramInfo::getComputePGMRSrc2() const { + uint64_t Reg = + 
S_00B84C_SCRATCH_EN(ScratchEnable) | S_00B84C_USER_SGPR(UserSGPR) | + S_00B84C_TRAP_HANDLER(TrapHandlerEnable) | + S_00B84C_TGID_X_EN(TGIdXEnable) | S_00B84C_TGID_Y_EN(TGIdYEnable) | + S_00B84C_TGID_Z_EN(TGIdZEnable) | S_00B84C_TG_SIZE_EN(TGSizeEnable) | + S_00B84C_TIDIG_COMP_CNT(TIdIGCompCount) | + S_00B84C_EXCP_EN_MSB(EXCPEnMSB) | S_00B84C_LDS_SIZE(LdsSize) | + S_00B84C_EXCP_EN(EXCPEnable); + + return Reg; +} + +uint64_t SIProgramInfo::getPGMRSrc2(CallingConv::ID CC) const { + if (AMDGPU::isCompute(CC)) + return getComputePGMRSrc2(); + + return 0; +} diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h index 553fb4cf496c..aab127e49463 100644 --- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h +++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h @@ -36,11 +36,23 @@ struct SIProgramInfo { uint32_t MemOrdered = 0; // GFX10+ uint64_t ScratchSize = 0; - // Fields set in PGM_RSRC2 pm4 packet. + // State used to calculate fields set in PGM_RSRC2 pm4 packet. uint32_t LDSBlocks = 0; uint32_t ScratchBlocks = 0; - uint64_t ComputePGMRSrc2 = 0; + // Fields set in PGM_RSRC2 pm4 packet + uint32_t ScratchEnable = 0; + uint32_t UserSGPR = 0; + uint32_t TrapHandlerEnable = 0; + uint32_t TGIdXEnable = 0; + uint32_t TGIdYEnable = 0; + uint32_t TGIdZEnable = 0; + uint32_t TGSizeEnable = 0; + uint32_t TIdIGCompCount = 0; + uint32_t EXCPEnMSB = 0; + uint32_t LdsSize = 0; + uint32_t EXCPEnable = 0; + uint64_t ComputePGMRSrc3GFX90A = 0; uint32_t NumVGPR = 0; @@ -75,6 +87,10 @@ struct SIProgramInfo { /// Compute the value of the ComputePGMRsrc1 register. uint64_t getComputePGMRSrc1() const; uint64_t getPGMRSrc1(CallingConv::ID CC) const; + + /// Compute the value of the ComputePGMRsrc2 register. 
+ uint64_t getComputePGMRSrc2() const; + uint64_t getPGMRSrc2(CallingConv::ID CC) const; }; } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index f7ce581f9736..1d50dff4a7d9 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -31,7 +31,7 @@ using namespace llvm; static cl::opt<bool> EnableSpillSGPRToVGPR( "amdgpu-spill-sgpr-to-vgpr", - cl::desc("Enable spilling VGPRs to SGPRs"), + cl::desc("Enable spilling SGPRs to VGPRs"), cl::ReallyHidden, cl::init(true)); @@ -170,7 +170,8 @@ struct SGPRSpillBuilder { // a register as actually in use in another lane, so we need to save all // used lanes of the chosen VGPR. assert(RS && "Cannot spill SGPR to memory without RegScavenger"); - TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false); + TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, + 0, false); // Reserve temporary stack slot TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI); @@ -199,7 +200,7 @@ struct SGPRSpillBuilder { const TargetRegisterClass &RC = IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass; RS->setRegUsed(SuperReg); - SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false); + SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false); int64_t VGPRLanes = getPerVGPRData().VGPRLanes; @@ -328,10 +329,9 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) "getNumCoveredRegs() will not work with generated subreg masks!"); RegPressureIgnoredUnits.resize(getNumRegUnits()); - RegPressureIgnoredUnits.set( - *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this)); + RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin()); for (auto Reg : AMDGPU::VGPR_HI16RegClass) - RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this)); + RegPressureIgnoredUnits.set(*regunits(Reg).begin()); // HACK: Until this is fully tablegen'd. 
static llvm::once_flag InitializeRegSplitPartsFlag; @@ -380,9 +380,7 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, MCRegister Reg) const { - MCRegAliasIterator R(Reg, this, true); - - for (; R.isValid(); ++R) + for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R) Reserved.set(*R); } @@ -535,11 +533,18 @@ unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel, return SubRegFromChannelTable[NumRegIndex - 1][Channel]; } +MCRegister +SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF, + const unsigned Align, + const TargetRegisterClass *RC) const { + unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align; + MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC); +} + MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { - unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; - MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); - return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass); + return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass); } BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { @@ -609,14 +614,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, Reg); } - for (auto Reg : AMDGPU::SReg_32RegClass) { - Reserved.set(getSubReg(Reg, AMDGPU::hi16)); - Register Low = getSubReg(Reg, AMDGPU::lo16); - // This is to prevent BB vcc liveness errors. 
- if (!AMDGPU::SGPR_LO16RegClass.contains(Low)) - Reserved.set(Low); - } - Register ScratchRSrcReg = MFI->getScratchRSrcReg(); if (ScratchRSrcReg != AMDGPU::NoRegister) { // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we @@ -625,6 +622,10 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, ScratchRSrcReg); } + Register LongBranchReservedReg = MFI->getLongBranchReservedReg(); + if (LongBranchReservedReg) + reserveRegisterTuples(Reserved, LongBranchReservedReg); + // We have to assume the SP is needed in case there are calls in the function, // which is detected after the function is lowered. If we aren't really going // to need SP, don't bother reserving it. @@ -646,24 +647,18 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); } + // FIXME: Use same reserved register introduced in D149775 + // SGPR used to preserve EXEC MASK around WWM spill/copy instructions. + Register ExecCopyReg = MFI->getSGPRForEXECCopy(); + if (ExecCopyReg) + reserveRegisterTuples(Reserved, ExecCopyReg); + // Reserve VGPRs/AGPRs. // unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); unsigned MaxNumAGPRs = MaxNumVGPRs; unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); - // Reserve all the AGPRs if there are no instructions to use it. - if (!ST.hasMAIInsts()) { - for (unsigned i = 0; i < MaxNumAGPRs; ++i) { - unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); - reserveRegisterTuples(Reserved, Reg); - } - } - - for (auto Reg : AMDGPU::AGPR_32RegClass) { - Reserved.set(getSubReg(Reg, AMDGPU::hi16)); - } - // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically, // a wave may have up to 512 total vector registers combining together both // VGPRs and AGPRs. 
Hence, in an entry function without calls and without @@ -690,9 +685,15 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, Reg); } - for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) { - unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); - reserveRegisterTuples(Reserved, Reg); + if (ST.hasMAIInsts()) { + for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) { + unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); + } + } else { + // Reserve all the AGPRs if there are no instructions to use it. + for (MCRegister Reg : AMDGPU::AGPR_32RegClass) + reserveRegisterTuples(Reserved, Reg); } // On GFX908, in order to guarantee copying between AGPRs, we need a scratch @@ -711,9 +712,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) reserveRegisterTuples(Reserved, Reg); - for (auto Reg : MFI->getSGPRSpillVGPRs()) - reserveRegisterTuples(Reserved, Reg); - return Reserved; } @@ -1065,6 +1063,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_A32_RESTORE: case AMDGPU::SI_SPILL_AV32_SAVE: case AMDGPU::SI_SPILL_AV32_RESTORE: + case AMDGPU::SI_SPILL_WWM_V32_SAVE: + case AMDGPU::SI_SPILL_WWM_V32_RESTORE: return 1; default: llvm_unreachable("Invalid spill opcode"); } @@ -1326,7 +1326,7 @@ void SIRegisterInfo::buildSpillLoadStore( const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores. const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC); - const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8; + const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8; // Always use 4 byte operations for AGPRs because we need to scavenge // a temporary VGPR. 
@@ -1607,7 +1607,8 @@ void SIRegisterInfo::buildSpillLoadStore( } else if (UseVGPROffset) { // FIXME: change to scavengeRegisterBackwards() if (!TmpOffsetVGPR) { - TmpOffsetVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); + TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, + MI, false, 0); RS->setRegUsed(TmpOffsetVGPR); } } @@ -1660,6 +1661,33 @@ void SIRegisterInfo::buildSpillLoadStore( if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg)) MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); + + // The epilog restore of a wwm-scratch register can cause undesired + // optimization during machine-cp post PrologEpilogInserter if the same + // register was assigned for return value ABI lowering with a COPY + // instruction. As given below, with the epilog reload, the earlier COPY + // appeared to be dead during machine-cp. + // ... + // v0 in WWM operation, needs the WWM spill at prolog/epilog. + // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0 + // ... + // Epilog block: + // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0 + // ... + // WWM spill restore to preserve the inactive lanes of v0. + // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1 + // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0 + // $exec = S_MOV_B64 killed $sgpr4_sgpr5 + // ... + // SI_RETURN implicit $vgpr0 + // ... + // To fix it, mark the same reg as a tied op for such restore instructions + // so that it marks a usage for the preceding COPY. 
+ if (!IsStore && MI != MBB.end() && MI->isReturn() && + MI->readsRegister(SubReg, this)) { + MIB.addReg(SubReg, RegState::Implicit); + MIB->tieOperands(0, MIB->getNumOperands() - 1); + } } if (ScratchOffsetRegDelta != 0) { @@ -1705,10 +1733,13 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, SlotIndexes *Indexes, - LiveIntervals *LIS, bool OnlyToVGPR) const { + LiveIntervals *LIS, bool OnlyToVGPR, + bool SpillToPhysVGPRLane) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); + ArrayRef<SpilledReg> VGPRSpills = + SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) + : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; @@ -1825,10 +1856,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, SlotIndexes *Indexes, - LiveIntervals *LIS, bool OnlyToVGPR) const { + LiveIntervals *LIS, bool OnlyToVGPR, + bool SpillToPhysVGPRLane) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); + ArrayRef<SpilledReg> VGPRSpills = + SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) + : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; @@ -1974,7 +2008,7 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI, /// handled. 
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, - SlotIndexes *Indexes, LiveIntervals *LIS) const { + SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const { switch (MI->getOpcode()) { case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: @@ -1990,7 +2024,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: - return spillSGPR(MI, FI, RS, Indexes, LIS, true); + return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S384_RESTORE: @@ -2005,7 +2039,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: - return restoreSGPR(MI, FI, RS, Indexes, LIS, true); + return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); default: llvm_unreachable("not an SGPR spill instruction"); } @@ -2109,7 +2143,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_AV128_SAVE: case AMDGPU::SI_SPILL_AV96_SAVE: case AMDGPU::SI_SPILL_AV64_SAVE: - case AMDGPU::SI_SPILL_AV32_SAVE: { + case AMDGPU::SI_SPILL_AV32_SAVE: + case AMDGPU::SI_SPILL_WWM_V32_SAVE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == @@ -2118,11 +2153,19 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_STORE_DWORD_SADDR : AMDGPU::BUFFER_STORE_DWORD_OFFSET; auto *MBB = MI->getParent(); + bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); + if (IsWWMRegSpill) { + TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), + RS->isRegUsed(AMDGPU::SCC)); + } buildSpillLoadStore( *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); + if (IsWWMRegSpill) + TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); + MI->eraseFromParent(); return true; } @@ -2167,7 +2210,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_AV352_RESTORE: case AMDGPU::SI_SPILL_AV384_RESTORE: case AMDGPU::SI_SPILL_AV512_RESTORE: - case AMDGPU::SI_SPILL_AV1024_RESTORE: { + case AMDGPU::SI_SPILL_AV1024_RESTORE: + case AMDGPU::SI_SPILL_WWM_V32_RESTORE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == @@ -2176,10 +2220,19 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; auto *MBB = MI->getParent(); + bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); + if (IsWWMRegSpill) { + TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), + RS->isRegUsed(AMDGPU::SCC)); + } buildSpillLoadStore( *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), *MI->memoperands_begin(), RS); + + if (IsWWMRegSpill) + TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); + MI->eraseFromParent(); return true; } @@ -2271,7 +2324,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass; - Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR); + Register TmpReg = + RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR); FIOp.setReg(TmpReg); FIOp.setIsKill(); @@ -2291,8 +2345,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, Register TmpSReg = UseSGPR ? TmpReg - : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, - !UseSGPR); + : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, + MI, false, 0, !UseSGPR); // TODO: for flat scratch another attempt can be made with a VGPR index // if no SGPRs can be scavenged. @@ -2366,8 +2420,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, : &AMDGPU::VGPR_32RegClass; bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 || MI->getOpcode() == AMDGPU::V_MOV_B32_e64; - Register ResultReg = IsCopy ? MI->getOperand(0).getReg() - : RS->scavengeRegister(RC, MI, 0); + Register ResultReg = + IsCopy ? 
MI->getOperand(0).getReg() + : RS->scavengeRegisterBackwards(*RC, MI, false, 0); int64_t Offset = FrameInfo.getObjectOffset(Index); if (Offset == 0) { @@ -2380,8 +2435,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (IsSALU && !LiveSCC) Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead. if (IsSALU && LiveSCC) { - Register NewDest = - RS->scavengeRegister(&AMDGPU::SReg_32RegClass, Shift, 0); + Register NewDest = RS->scavengeRegisterBackwards( + AMDGPU::SReg_32RegClass, Shift, false, 0); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest) .addReg(ResultReg); @@ -2435,8 +2490,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // We may have 1 free scratch SGPR even though a carry out is // unavailable. Only one additional mov is needed. - Register TmpScaledReg = - RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); + Register TmpScaledReg = RS->scavengeRegisterBackwards( + AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false); Register ScaledReg = TmpScaledReg.isValid() ? 
TmpScaledReg : FrameReg; BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) @@ -2501,7 +2556,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, FIOp.ChangeToImmediate(Offset); if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { - Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); + Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, + MI, false, 0); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) .addImm(Offset); FIOp.ChangeToRegister(TmpReg, false, false, true); @@ -2517,31 +2573,31 @@ StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const { static const TargetRegisterClass * getAnyVGPRClassForBitWidth(unsigned BitWidth) { - if (BitWidth <= 64) + if (BitWidth == 64) return &AMDGPU::VReg_64RegClass; - if (BitWidth <= 96) + if (BitWidth == 96) return &AMDGPU::VReg_96RegClass; - if (BitWidth <= 128) + if (BitWidth == 128) return &AMDGPU::VReg_128RegClass; - if (BitWidth <= 160) + if (BitWidth == 160) return &AMDGPU::VReg_160RegClass; - if (BitWidth <= 192) + if (BitWidth == 192) return &AMDGPU::VReg_192RegClass; - if (BitWidth <= 224) + if (BitWidth == 224) return &AMDGPU::VReg_224RegClass; - if (BitWidth <= 256) + if (BitWidth == 256) return &AMDGPU::VReg_256RegClass; - if (BitWidth <= 288) + if (BitWidth == 288) return &AMDGPU::VReg_288RegClass; - if (BitWidth <= 320) + if (BitWidth == 320) return &AMDGPU::VReg_320RegClass; - if (BitWidth <= 352) + if (BitWidth == 352) return &AMDGPU::VReg_352RegClass; - if (BitWidth <= 384) + if (BitWidth == 384) return &AMDGPU::VReg_384RegClass; - if (BitWidth <= 512) + if (BitWidth == 512) return &AMDGPU::VReg_512RegClass; - if (BitWidth <= 1024) + if (BitWidth == 1024) return &AMDGPU::VReg_1024RegClass; return nullptr; @@ -2549,31 +2605,31 @@ getAnyVGPRClassForBitWidth(unsigned BitWidth) { static const TargetRegisterClass * getAlignedVGPRClassForBitWidth(unsigned BitWidth) { - if (BitWidth <= 64) + if (BitWidth == 64) 
return &AMDGPU::VReg_64_Align2RegClass; - if (BitWidth <= 96) + if (BitWidth == 96) return &AMDGPU::VReg_96_Align2RegClass; - if (BitWidth <= 128) + if (BitWidth == 128) return &AMDGPU::VReg_128_Align2RegClass; - if (BitWidth <= 160) + if (BitWidth == 160) return &AMDGPU::VReg_160_Align2RegClass; - if (BitWidth <= 192) + if (BitWidth == 192) return &AMDGPU::VReg_192_Align2RegClass; - if (BitWidth <= 224) + if (BitWidth == 224) return &AMDGPU::VReg_224_Align2RegClass; - if (BitWidth <= 256) + if (BitWidth == 256) return &AMDGPU::VReg_256_Align2RegClass; - if (BitWidth <= 288) + if (BitWidth == 288) return &AMDGPU::VReg_288_Align2RegClass; - if (BitWidth <= 320) + if (BitWidth == 320) return &AMDGPU::VReg_320_Align2RegClass; - if (BitWidth <= 352) + if (BitWidth == 352) return &AMDGPU::VReg_352_Align2RegClass; - if (BitWidth <= 384) + if (BitWidth == 384) return &AMDGPU::VReg_384_Align2RegClass; - if (BitWidth <= 512) + if (BitWidth == 512) return &AMDGPU::VReg_512_Align2RegClass; - if (BitWidth <= 1024) + if (BitWidth == 1024) return &AMDGPU::VReg_1024_Align2RegClass; return nullptr; @@ -2583,9 +2639,9 @@ const TargetRegisterClass * SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { if (BitWidth == 1) return &AMDGPU::VReg_1RegClass; - if (BitWidth <= 16) + if (BitWidth == 16) return &AMDGPU::VGPR_LO16RegClass; - if (BitWidth <= 32) + if (BitWidth == 32) return &AMDGPU::VGPR_32RegClass; return ST.needsAlignedVGPRs() ? 
getAlignedVGPRClassForBitWidth(BitWidth) : getAnyVGPRClassForBitWidth(BitWidth); @@ -2593,31 +2649,31 @@ SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { static const TargetRegisterClass * getAnyAGPRClassForBitWidth(unsigned BitWidth) { - if (BitWidth <= 64) + if (BitWidth == 64) return &AMDGPU::AReg_64RegClass; - if (BitWidth <= 96) + if (BitWidth == 96) return &AMDGPU::AReg_96RegClass; - if (BitWidth <= 128) + if (BitWidth == 128) return &AMDGPU::AReg_128RegClass; - if (BitWidth <= 160) + if (BitWidth == 160) return &AMDGPU::AReg_160RegClass; - if (BitWidth <= 192) + if (BitWidth == 192) return &AMDGPU::AReg_192RegClass; - if (BitWidth <= 224) + if (BitWidth == 224) return &AMDGPU::AReg_224RegClass; - if (BitWidth <= 256) + if (BitWidth == 256) return &AMDGPU::AReg_256RegClass; - if (BitWidth <= 288) + if (BitWidth == 288) return &AMDGPU::AReg_288RegClass; - if (BitWidth <= 320) + if (BitWidth == 320) return &AMDGPU::AReg_320RegClass; - if (BitWidth <= 352) + if (BitWidth == 352) return &AMDGPU::AReg_352RegClass; - if (BitWidth <= 384) + if (BitWidth == 384) return &AMDGPU::AReg_384RegClass; - if (BitWidth <= 512) + if (BitWidth == 512) return &AMDGPU::AReg_512RegClass; - if (BitWidth <= 1024) + if (BitWidth == 1024) return &AMDGPU::AReg_1024RegClass; return nullptr; @@ -2625,31 +2681,31 @@ getAnyAGPRClassForBitWidth(unsigned BitWidth) { static const TargetRegisterClass * getAlignedAGPRClassForBitWidth(unsigned BitWidth) { - if (BitWidth <= 64) + if (BitWidth == 64) return &AMDGPU::AReg_64_Align2RegClass; - if (BitWidth <= 96) + if (BitWidth == 96) return &AMDGPU::AReg_96_Align2RegClass; - if (BitWidth <= 128) + if (BitWidth == 128) return &AMDGPU::AReg_128_Align2RegClass; - if (BitWidth <= 160) + if (BitWidth == 160) return &AMDGPU::AReg_160_Align2RegClass; - if (BitWidth <= 192) + if (BitWidth == 192) return &AMDGPU::AReg_192_Align2RegClass; - if (BitWidth <= 224) + if (BitWidth == 224) return &AMDGPU::AReg_224_Align2RegClass; - if (BitWidth 
<= 256) + if (BitWidth == 256) return &AMDGPU::AReg_256_Align2RegClass; - if (BitWidth <= 288) + if (BitWidth == 288) return &AMDGPU::AReg_288_Align2RegClass; - if (BitWidth <= 320) + if (BitWidth == 320) return &AMDGPU::AReg_320_Align2RegClass; - if (BitWidth <= 352) + if (BitWidth == 352) return &AMDGPU::AReg_352_Align2RegClass; - if (BitWidth <= 384) + if (BitWidth == 384) return &AMDGPU::AReg_384_Align2RegClass; - if (BitWidth <= 512) + if (BitWidth == 512) return &AMDGPU::AReg_512_Align2RegClass; - if (BitWidth <= 1024) + if (BitWidth == 1024) return &AMDGPU::AReg_1024_Align2RegClass; return nullptr; @@ -2657,9 +2713,9 @@ getAlignedAGPRClassForBitWidth(unsigned BitWidth) { const TargetRegisterClass * SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { - if (BitWidth <= 16) + if (BitWidth == 16) return &AMDGPU::AGPR_LO16RegClass; - if (BitWidth <= 32) + if (BitWidth == 32) return &AMDGPU::AGPR_32RegClass; return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth) : getAnyAGPRClassForBitWidth(BitWidth); @@ -2667,31 +2723,31 @@ SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const { static const TargetRegisterClass * getAnyVectorSuperClassForBitWidth(unsigned BitWidth) { - if (BitWidth <= 64) + if (BitWidth == 64) return &AMDGPU::AV_64RegClass; - if (BitWidth <= 96) + if (BitWidth == 96) return &AMDGPU::AV_96RegClass; - if (BitWidth <= 128) + if (BitWidth == 128) return &AMDGPU::AV_128RegClass; - if (BitWidth <= 160) + if (BitWidth == 160) return &AMDGPU::AV_160RegClass; - if (BitWidth <= 192) + if (BitWidth == 192) return &AMDGPU::AV_192RegClass; - if (BitWidth <= 224) + if (BitWidth == 224) return &AMDGPU::AV_224RegClass; - if (BitWidth <= 256) + if (BitWidth == 256) return &AMDGPU::AV_256RegClass; - if (BitWidth <= 288) + if (BitWidth == 288) return &AMDGPU::AV_288RegClass; - if (BitWidth <= 320) + if (BitWidth == 320) return &AMDGPU::AV_320RegClass; - if (BitWidth <= 352) + if (BitWidth == 352) return 
&AMDGPU::AV_352RegClass; - if (BitWidth <= 384) + if (BitWidth == 384) return &AMDGPU::AV_384RegClass; - if (BitWidth <= 512) + if (BitWidth == 512) return &AMDGPU::AV_512RegClass; - if (BitWidth <= 1024) + if (BitWidth == 1024) return &AMDGPU::AV_1024RegClass; return nullptr; @@ -2699,31 +2755,31 @@ getAnyVectorSuperClassForBitWidth(unsigned BitWidth) { static const TargetRegisterClass * getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) { - if (BitWidth <= 64) + if (BitWidth == 64) return &AMDGPU::AV_64_Align2RegClass; - if (BitWidth <= 96) + if (BitWidth == 96) return &AMDGPU::AV_96_Align2RegClass; - if (BitWidth <= 128) + if (BitWidth == 128) return &AMDGPU::AV_128_Align2RegClass; - if (BitWidth <= 160) + if (BitWidth == 160) return &AMDGPU::AV_160_Align2RegClass; - if (BitWidth <= 192) + if (BitWidth == 192) return &AMDGPU::AV_192_Align2RegClass; - if (BitWidth <= 224) + if (BitWidth == 224) return &AMDGPU::AV_224_Align2RegClass; - if (BitWidth <= 256) + if (BitWidth == 256) return &AMDGPU::AV_256_Align2RegClass; - if (BitWidth <= 288) + if (BitWidth == 288) return &AMDGPU::AV_288_Align2RegClass; - if (BitWidth <= 320) + if (BitWidth == 320) return &AMDGPU::AV_320_Align2RegClass; - if (BitWidth <= 352) + if (BitWidth == 352) return &AMDGPU::AV_352_Align2RegClass; - if (BitWidth <= 384) + if (BitWidth == 384) return &AMDGPU::AV_384_Align2RegClass; - if (BitWidth <= 512) + if (BitWidth == 512) return &AMDGPU::AV_512_Align2RegClass; - if (BitWidth <= 1024) + if (BitWidth == 1024) return &AMDGPU::AV_1024_Align2RegClass; return nullptr; @@ -2731,9 +2787,9 @@ getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) { const TargetRegisterClass * SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { - if (BitWidth <= 16) + if (BitWidth == 16) return &AMDGPU::VGPR_LO16RegClass; - if (BitWidth <= 32) + if (BitWidth == 32) return &AMDGPU::AV_32RegClass; return ST.needsAlignedVGPRs() ? 
getAlignedVectorSuperClassForBitWidth(BitWidth) @@ -2742,35 +2798,35 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const { const TargetRegisterClass * SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) { - if (BitWidth <= 16) + if (BitWidth == 16) return &AMDGPU::SGPR_LO16RegClass; - if (BitWidth <= 32) + if (BitWidth == 32) return &AMDGPU::SReg_32RegClass; - if (BitWidth <= 64) + if (BitWidth == 64) return &AMDGPU::SReg_64RegClass; - if (BitWidth <= 96) + if (BitWidth == 96) return &AMDGPU::SGPR_96RegClass; - if (BitWidth <= 128) + if (BitWidth == 128) return &AMDGPU::SGPR_128RegClass; - if (BitWidth <= 160) + if (BitWidth == 160) return &AMDGPU::SGPR_160RegClass; - if (BitWidth <= 192) + if (BitWidth == 192) return &AMDGPU::SGPR_192RegClass; - if (BitWidth <= 224) + if (BitWidth == 224) return &AMDGPU::SGPR_224RegClass; - if (BitWidth <= 256) + if (BitWidth == 256) return &AMDGPU::SGPR_256RegClass; - if (BitWidth <= 288) + if (BitWidth == 288) return &AMDGPU::SGPR_288RegClass; - if (BitWidth <= 320) + if (BitWidth == 320) return &AMDGPU::SGPR_320RegClass; - if (BitWidth <= 352) + if (BitWidth == 352) return &AMDGPU::SGPR_352RegClass; - if (BitWidth <= 384) + if (BitWidth == 384) return &AMDGPU::SGPR_384RegClass; - if (BitWidth <= 512) + if (BitWidth == 512) return &AMDGPU::SGPR_512RegClass; - if (BitWidth <= 1024) + if (BitWidth == 1024) return &AMDGPU::SGPR_1024RegClass; return nullptr; @@ -2863,13 +2919,12 @@ bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { /// Returns a lowest register that is not used at any point in the function. /// If all registers are used, then this function will return -/// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return +/// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return /// highest unused register. 
-MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineFunction &MF, - bool ReserveHighestVGPR) const { - if (ReserveHighestVGPR) { +MCRegister SIRegisterInfo::findUnusedRegister( + const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, + const MachineFunction &MF, bool ReserveHighestRegister) const { + if (ReserveHighestRegister) { for (MCRegister Reg : reverse(*RC)) if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) return Reg; @@ -2881,9 +2936,19 @@ MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, return MCRegister(); } +bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI, + const RegisterBankInfo &RBI, + Register Reg) const { + auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo()); + if (!RB) + return false; + + return !RBI.isDivergentRegBank(RB); +} + ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const { - const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC); + const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC); assert(RegBitWidth >= 32 && RegBitWidth <= 1024); const unsigned RegDWORDs = RegBitWidth / 32; @@ -3084,9 +3149,8 @@ MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg, DefIdx = V->def; } else { // Find last def. 
- for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid(); - ++Units) { - LiveRange &LR = LIS->getRegUnit(*Units); + for (MCRegUnit Unit : regunits(Reg.asMCReg())) { + LiveRange &LR = LIS->getRegUnit(Unit); if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { if (!DefIdx.isValid() || MDT.dominates(LIS->getInstructionFromIndex(DefIdx), @@ -3173,3 +3237,19 @@ ArrayRef<MCPhysReg> SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const { return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); } + +unsigned +SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, + unsigned SubReg) const { + switch (RC->TSFlags & SIRCFlags::RegKindMask) { + case SIRCFlags::HasSGPR: + return std::min(128u, getSubRegIdxSize(SubReg)); + case SIRCFlags::HasAGPR: + case SIRCFlags::HasVGPR: + case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR: + return std::min(32u, getSubRegIdxSize(SubReg)); + default: + break; + } + return 0; +} diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index e9ddf82fb5c8..17fce43891c5 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -70,6 +70,12 @@ public: return SpillSGPRToVGPR; } + /// Return the largest available SGPR aligned to \p Align for the register + /// class \p RC. + MCRegister getAlignedHighSGPRForRC(const MachineFunction &MF, + const unsigned Align, + const TargetRegisterClass *RC) const; + /// Return the end register initially reserved for the scratch buffer in case /// spilling is needed. MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; @@ -136,14 +142,17 @@ public: void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset, bool IsLoad, bool IsKill = true) const; - /// If \p OnlyToVGPR is true, this will only succeed if this + /// If \p OnlyToVGPR is true, this will only succeed if this manages to find a + /// free VGPR lane to spill. 
bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr, - bool OnlyToVGPR = false) const; + bool OnlyToVGPR = false, + bool SpillToPhysVGPRLane = false) const; bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr, - bool OnlyToVGPR = false) const; + bool OnlyToVGPR = false, + bool SpillToPhysVGPRLane = false) const; bool spillEmergencySGPR(MachineBasicBlock::iterator MI, MachineBasicBlock &RestoreMBB, Register SGPR, @@ -157,10 +166,10 @@ public: unsigned FIOperandNum, RegScavenger *RS) const override; - bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, - int FI, RegScavenger *RS, - SlotIndexes *Indexes = nullptr, - LiveIntervals *LIS = nullptr) const; + bool eliminateSGPRToVGPRSpillFrameIndex( + MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, + SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr, + bool SpillToPhysVGPRLane = false) const; StringRef getRegAsmName(MCRegister Reg) const override; @@ -286,10 +295,17 @@ public: return isVGPR(MRI, Reg) || isAGPR(MRI, Reg); } + // FIXME: SGPRs are assumed to be uniform, but this is not true for i1 SGPRs + // (such as VCC) which hold a wave-wide vector of boolean values. Examining + // just the register class is not suffcient; it needs to be combined with a + // value type. The next predicate isUniformReg() does this correctly. 
bool isDivergentRegClass(const TargetRegisterClass *RC) const override { return !isSGPRClass(RC); } + bool isUniformReg(const MachineRegisterInfo &MRI, const RegisterBankInfo &RBI, + Register Reg) const override; + ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const; @@ -411,6 +427,25 @@ public: int64_t InstrOffset, MachineMemOperand *MMO, RegScavenger *RS, LivePhysRegs *LiveRegs = nullptr) const; + + // Return alignment in register file of first register in a register tuple. + unsigned getRegClassAlignmentNumBits(const TargetRegisterClass *RC) const { + return (RC->TSFlags & SIRCFlags::RegTupleAlignUnitsMask) * 32; + } + + // Check if register class RC has required alignment. + bool isRegClassAligned(const TargetRegisterClass *RC, + unsigned AlignNumBits) const { + assert(AlignNumBits != 0); + unsigned RCAlign = getRegClassAlignmentNumBits(RC); + return RCAlign == AlignNumBits || + (RCAlign > AlignNumBits && (RCAlign % AlignNumBits) == 0); + } + + // Return alignment of a SubReg relative to start of a register in RC class. + // No check if the subreg is supported by the current RC is made. 
+ unsigned getSubRegAlignmentNumBits(const TargetRegisterClass *RC, + unsigned SubReg) const; }; } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 12053c4b8724..b2b1b458a63a 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -10,16 +10,6 @@ // Subregister declarations //===----------------------------------------------------------------------===// -class Indexes<int N> { - list<int> all = [0, 1, 2, 3, 4, 5, 6 , 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31]; - - // Returns list of indexes [0..N) - list<int> slice = !filter(i, all, !lt(i, N)); -} - let Namespace = "AMDGPU" in { def lo16 : SubRegIndex<16, 0>; @@ -35,13 +25,11 @@ foreach Index = 1...31 in { } foreach Size = {2...6,8,16} in { - foreach Index = Indexes<!sub(33, Size)>.slice in { - def !interleave(!foreach(cur, Indexes<Size>.slice, "sub"#!add(cur, Index)), - "_") : + foreach Index = !range(!sub(33, Size)) in { + def !interleave(!foreach(cur, !range(Size), "sub"#!add(cur, Index)), "_") : SubRegIndex<!mul(Size, 32), !shl(Index, 5)> { let CoveringSubRegIndices = - !foreach(cur, Indexes<Size>.slice, - !cast<SubRegIndex>(sub#!add(cur, Index))); + !foreach(cur, !range(Size), !cast<SubRegIndex>(sub#!add(cur, Index))); } } } @@ -150,10 +138,14 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList> // For scalar register classes. field bit HasSGPR = 0; + // Alignment of the first register in tuple (in 32-bit units). + field int RegTupleAlignUnits = 1; + // These need to be kept in sync with the enum SIRCFlags. 
- let TSFlags{0} = HasVGPR; - let TSFlags{1} = HasAGPR; - let TSFlags{2} = HasSGPR; + let TSFlags{1-0} = RegTupleAlignUnits; + let TSFlags{2} = HasVGPR; + let TSFlags{3} = HasAGPR; + let TSFlags{4} = HasSGPR; } multiclass SIRegLoHi16 <string n, bits<16> regIdx, bit ArtificialHigh = 1, @@ -421,7 +413,7 @@ def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, def SGPR_64Regs : SIRegisterTuples<getSubRegs<2>.ret, SGPR_32, 105, 2, 2, "s">; // SGPR 96-bit registers. No operations use these, but for symmetry with 96-bit VGPRs. -def SGPR_96Regs : SIRegisterTuples<getSubRegs<3>.ret, SGPR_32, 105, 3, 3, "s">; +def SGPR_96Regs : SIRegisterTuples<getSubRegs<3>.ret, SGPR_32, 105, 4, 3, "s">; // SGPR 128-bit registers def SGPR_128Regs : SIRegisterTuples<getSubRegs<4>.ret, SGPR_32, 105, 4, 4, "s">; @@ -774,7 +766,7 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, SRC_PRIVATE_LIMIT_HI_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16, EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16)> { let Size = 16; - let AllocationPriority = 0; + let isAllocatable = 0; let BaseClassOrder = 16; } @@ -817,6 +809,21 @@ def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], let HasSGPR = 1; } +// CCR (call clobbered registers) SGPR 64-bit registers +def CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, (add (trunc SGPR_64, 15))> { + let CopyCost = SGPR_64.CopyCost; + let AllocationPriority = SGPR_64.AllocationPriority; + let HasSGPR = 1; +} + +// Call clobbered 64-bit SGPRs for AMDGPU_Gfx CC +def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, + (add (trunc (shl SGPR_64, 18), 14))> { // s[36:37]-s[s62:63] + let CopyCost = SGPR_64.CopyCost; + let AllocationPriority = SGPR_64.AllocationPriority; + let HasSGPR = 1; +} + def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> { let isAllocatable = 0; @@ -931,6 +938,7 @@ multiclass VRegClass<int 
numRegs, list<ValueType> regTypes, dag regList> { def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)> { // Give aligned class higher priority in base class resolution let BaseClassOrder = !sub(!mul(numRegs, 32), 1); + let RegTupleAlignUnits = 2; } } } @@ -965,6 +973,7 @@ multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> { def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)> { // Give aligned class higher priority in base class resolution let BaseClassOrder = !sub(!mul(numRegs, 32), 1); + let RegTupleAlignUnits = 2; } } } @@ -1033,10 +1042,12 @@ multiclass AVRegClass<int numRegs, list<ValueType> regTypes, // Define the regular class. def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>; - // Define 2-aligned variant + // Define 2-aligned variant def _Align2 : VRegClassBase<numRegs, regTypes, (add (decimate vregList, 2), - (decimate aregList, 2))>; + (decimate aregList, 2))> { + let RegTupleAlignUnits = 2; + } } } @@ -1066,185 +1077,123 @@ class RegImmMatcher<string name> : AsmOperandClass { let RenderMethod = "addRegOrImmOperands"; } -// For VOP1,2,C True16 instructions. 
Uses first 128 32-bit VGPRs only -multiclass SIRegOperand16 <string rc, string MatchName, string opType, - string rc_suffix = "_32"> { - let OperandNamespace = "AMDGPU" in { - def _b16_Lo128 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix#"_Lo128")> { - let OperandType = opType#"_INT16"; - let ParserMatchClass = RegImmMatcher<MatchName#"B16_Lo128">; - let DecoderMethod = "decodeOperand_VSrc16"; - } - - def _f16_Lo128 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix#"_Lo128")> { - let OperandType = opType#"_FP16"; - let ParserMatchClass = RegImmMatcher<MatchName#"F16_Lo128">; - let DecoderMethod = "decodeOperand_" # rc # "_16"; - } - } -} - - -multiclass SIRegOperand32 <string rc, string MatchName, string opType, - string rc_suffix = "_32"> { - let OperandNamespace = "AMDGPU" in { - def _b16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { - let OperandType = opType#"_INT16"; - let ParserMatchClass = RegImmMatcher<MatchName#"B16">; - let DecoderMethod = "decodeOperand_VSrc16"; - } - - def _f16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { - let OperandType = opType#"_FP16"; - let ParserMatchClass = RegImmMatcher<MatchName#"F16">; - let DecoderMethod = "decodeOperand_" # rc # "_16"; - } +class RegOrImmOperand <string RegisterClassName, string OperandTypeName, + string ParserMatchClassName, string decoderImmSize> + : RegisterOperand<!cast<RegisterClass>(RegisterClassName)> { + let OperandNamespace = "AMDGPU"; + let OperandType = OperandTypeName; + let ParserMatchClass = RegImmMatcher<ParserMatchClassName>; + let DecoderMethod = "decodeOperand_" # RegisterClassName # decoderImmSize; + } - def _b32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { - let OperandType = opType#"_INT32"; - let ParserMatchClass = RegImmMatcher<MatchName#"B32">; - let DecoderMethod = "decodeOperand_" # rc # rc_suffix; - } +class RegOrB16 <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT16", + 
!subst("_b16", "B16", NAME), "_Imm16">; - def _f32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { - let OperandType = opType#"_FP32"; - let ParserMatchClass = RegImmMatcher<MatchName#"F32">; - let DecoderMethod = "decodeOperand_" # rc # rc_suffix; - } +class RegOrF16 <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16", + !subst("_f16", "F16", NAME), "_Imm16">; - def _v2b16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { - let OperandType = opType#"_V2INT16"; - let ParserMatchClass = RegImmMatcher<MatchName#"V2B16">; - let DecoderMethod = "decodeOperand_VSrcV216"; - } +class RegOrB32 <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT32", + !subst("_b32", "B32", NAME), "_Imm32">; - def _v2f16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { - let OperandType = opType#"_V2FP16"; - let ParserMatchClass = RegImmMatcher<MatchName#"V2F16">; - let DecoderMethod = "decodeOperand_VSrcV216"; - } - } -} +class RegOrF32 <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP32", + !subst("_f32", "F32", NAME), "_Imm32">; -multiclass SIRegOperand64 <string rc, string MatchName, string opType, - string rc_suffix = "_64", bit Vectors = 1> { - let OperandNamespace = "AMDGPU" in { - def _b64 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { - let OperandType = opType#"_INT64"; - let ParserMatchClass = RegImmMatcher<MatchName#"B64">; - } +class RegOrV2B16 <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2INT16", + !subst("_v2b16", "V2B16", NAME), "_Imm16">; - def _f64 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { - let OperandType = opType#"_FP64"; - let ParserMatchClass = RegImmMatcher<MatchName#"F64">; - } +class RegOrV2F16 <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, 
OperandTypePrefix # "_V2FP16", + !subst("_v2f16", "V2F16", NAME), "_Imm16">; - if Vectors then - def _v2f32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { - let OperandType = opType#"_V2FP32"; - let ParserMatchClass = RegImmMatcher<MatchName#"V2FP32">; - let DecoderMethod = "decodeOperand_VSrcV232"; - } - if Vectors then - def _v2b32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { - let OperandType = opType#"_V2INT32"; - let ParserMatchClass = RegImmMatcher<MatchName#"V2INT32">; - let DecoderMethod = "decodeOperand_VSrcV232"; - } - } -} +class RegOrF64 <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP64", + !subst("_f64", "F64", NAME), "_Imm64">; -multiclass SIRegOperand <string rc, string MatchName, string opType> : - SIRegOperand32<rc, MatchName, opType>, - SIRegOperand64<rc, MatchName, opType>; +class RegOrB64 <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT64", + !subst("_b64", "B64", NAME), "_Imm64">; -// FIXME: 64-bit sources can sometimes use 32-bit constants. -multiclass RegImmOperand <string rc, string MatchName> - : SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">; +class RegOrV2F32 <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2FP32", + !subst("_v2f32", "V2FP32", NAME), "_Imm32">; -multiclass RegInlineOperand <string rc, string MatchName> - : SIRegOperand<rc, MatchName, "OPERAND_REG_INLINE_C">; +class RegOrV2B32 <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2INT32", + !subst("_v2b32", "V2INT32", NAME), "_Imm32">; -multiclass RegInlineOperand32 <string rc, string MatchName, - string rc_suffix = "_32"> - : SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>; +// For VOP1,2,C True16 instructions. _Lo128 use first 128 32-bit VGPRs only. 
+class RegOrB16_Lo128 <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT16", + !subst("_b16_Lo128", "B16_Lo128", NAME), "_Imm16">; -multiclass RegInlineOperand64 <string rc, string MatchName, - string rc_suffix = "_64"> - : SIRegOperand64<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>; +class RegOrF16_Lo128 <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16", + !subst("_f16_Lo128", "F16_Lo128", NAME), "_Imm16">; -multiclass RegInlineOperandAC <string rc, string MatchName, - string rc_suffix = "_32"> - : SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix>; +// Deferred operands +class RegOrF16_Deferred <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16_DEFERRED", + !subst("_f16_Deferred", "F16", NAME), "_Deferred_Imm16">; -multiclass RegInlineOperandAC64 <string rc, string MatchName, - string rc_suffix = "_64"> - : SIRegOperand64<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix, 0>; +class RegOrF32_Deferred <string RegisterClass, string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP32_DEFERRED", + !subst("_f32_Deferred", "F32", NAME), "_Deferred_Imm32">; +class RegOrF16_Lo128_Deferred <string RegisterClass, + string OperandTypePrefix> + : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16_DEFERRED", + !subst("_f16_Lo128_Deferred", "F16_Lo128", NAME), + "_Deferred_Imm16">; //===----------------------------------------------------------------------===// // SSrc_* Operands with an SGPR or a 32-bit immediate //===----------------------------------------------------------------------===// -defm SSrc : RegImmOperand<"SReg", "SSrc">; +def SSrc_b32 : RegOrB32 <"SReg_32", "OPERAND_REG_IMM">; +def SSrc_f32 : RegOrF32 <"SReg_32", "OPERAND_REG_IMM">; +def SSrc_b64 : RegOrB64 <"SReg_64", "OPERAND_REG_IMM">; -def SSrcOrLds_b32 : 
RegisterOperand<SRegOrLds_32> { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_REG_IMM_INT32"; - let ParserMatchClass = RegImmMatcher<"SSrcOrLdsB32">; -} +def SSrcOrLds_b32 : RegOrB32 <"SRegOrLds_32", "OPERAND_REG_IMM">; //===----------------------------------------------------------------------===// // SCSrc_* Operands with an SGPR or a inline constant //===----------------------------------------------------------------------===// -defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ; +def SCSrc_b32 : RegOrB32 <"SReg_32", "OPERAND_REG_INLINE_C">; +def SCSrc_b64 : RegOrB64 <"SReg_64", "OPERAND_REG_INLINE_C">; //===----------------------------------------------------------------------===// // VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate //===----------------------------------------------------------------------===// -defm VSrc : RegImmOperand<"VS", "VSrc">; -defm VSrcT : SIRegOperand16<"VS", "VSrcT", "OPERAND_REG_IMM">; +def VSrc_b16 : RegOrB16 <"VS_32", "OPERAND_REG_IMM">; +def VSrc_f16 : RegOrF16 <"VS_32", "OPERAND_REG_IMM">; +def VSrc_b32 : RegOrB32 <"VS_32", "OPERAND_REG_IMM">; +def VSrc_f32 : RegOrF32 <"VS_32", "OPERAND_REG_IMM">; +def VSrc_v2b16 : RegOrV2B16 <"VS_32", "OPERAND_REG_IMM">; +def VSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_IMM">; +def VSrc_b64 : RegOrB64 <"VS_64", "OPERAND_REG_IMM">; +def VSrc_f64 : RegOrF64 <"VS_64", "OPERAND_REG_IMM">; +def VSrc_v2b32 : RegOrV2B32 <"VS_64", "OPERAND_REG_IMM">; +def VSrc_v2f32 : RegOrV2F32 <"VS_64", "OPERAND_REG_IMM">; -def VSrc_128 : RegisterOperand<VReg_128> { - let DecoderMethod = "DecodeVS_128RegisterClass"; -} +def VSrcT_b16_Lo128 : RegOrB16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">; +def VSrcT_f16_Lo128 : RegOrF16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">; //===----------------------------------------------------------------------===// // VSrc_*_Deferred Operands with an SGPR, VGPR or a 32-bit immediate for use // with FMAMK/FMAAK 
//===----------------------------------------------------------------------===// -multiclass SIRegOperand16_Deferred <string rc, string MatchName, string opType, - string rc_suffix = "_32"> { - let OperandNamespace = "AMDGPU" in { - def _f16_Lo128_Deferred : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix#"_Lo128")> { - let OperandType = opType#"_FP16_DEFERRED"; - let ParserMatchClass = RegImmMatcher<MatchName#"F16_Lo128">; - let DecoderMethod = "decodeOperand_" # rc # "_16_Deferred"; - } - } -} - -multiclass SIRegOperand32_Deferred <string rc, string MatchName, string opType, - string rc_suffix = "_32"> { - let OperandNamespace = "AMDGPU" in { - def _f16_Deferred : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { - let OperandType = opType#"_FP16_DEFERRED"; - let ParserMatchClass = RegImmMatcher<MatchName#"F16">; - let DecoderMethod = "decodeOperand_" # rc # "_16_Deferred"; - } - - def _f32_Deferred : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> { - let OperandType = opType#"_FP32_DEFERRED"; - let ParserMatchClass = RegImmMatcher<MatchName#"F32">; - let DecoderMethod = "decodeOperand_" # rc # "_32_Deferred"; - } - } -} +def VSrc_f16_Deferred : RegOrF16_Deferred<"VS_32", "OPERAND_REG_IMM">; +def VSrc_f32_Deferred : RegOrF32_Deferred<"VS_32", "OPERAND_REG_IMM">; -defm VSrc : SIRegOperand32_Deferred<"VS", "VSrc", "OPERAND_REG_IMM">; -defm VSrcT : SIRegOperand16_Deferred<"VS", "VSrcT", "OPERAND_REG_IMM">; +def VSrcT_f16_Lo128_Deferred : RegOrF16_Lo128_Deferred<"VS_32_Lo128", + "OPERAND_REG_IMM">; //===----------------------------------------------------------------------===// // VRegSrc_* Operands with a VGPR @@ -1253,8 +1202,7 @@ defm VSrcT : SIRegOperand16_Deferred<"VS", "VSrcT", "OPERAND_REG_IMM">; // This is for operands with the enum(9), VSrc encoding restriction, // but only allows VGPRs. 
def VRegSrc_32 : RegisterOperand<VGPR_32> { - //let ParserMatchClass = RegImmMatcher<"VRegSrc32">; - let DecoderMethod = "DecodeVS_32RegisterClass"; + let DecoderMethod = "decodeOperand_VGPR_32"; } def VRegSrc_64 : RegisterOperand<VReg_64> { @@ -1269,6 +1217,10 @@ def VRegSrc_256 : RegisterOperand<VReg_256> { let DecoderMethod = "decodeOperand_VReg_256"; } +def VRegOrLdsSrc_32 : RegisterOperand<VRegOrLds_32> { + let DecoderMethod = "decodeOperand_VRegOrLds_32"; +} + //===----------------------------------------------------------------------===// // VGPRSrc_* //===----------------------------------------------------------------------===// @@ -1286,7 +1238,7 @@ def VGPRSrc_32_Lo128 : RegisterOperand<VGPR_32_Lo128> { //===----------------------------------------------------------------------===// def ARegSrc_32 : RegisterOperand<AGPR_32> { - let DecoderMethod = "DecodeAGPR_32RegisterClass"; + let DecoderMethod = "decodeOperand_AGPR_32"; let EncoderMethod = "getAVOperandEncoding"; } @@ -1294,38 +1246,42 @@ def ARegSrc_32 : RegisterOperand<AGPR_32> { // VCSrc_* Operands with an SGPR, VGPR or an inline constant //===----------------------------------------------------------------------===// -defm VCSrc : RegInlineOperand<"VS", "VCSrc">; -defm VCSrcT : SIRegOperand16<"VS", "VCSrcT", "OPERAND_REG_INLINE_C">; +def VCSrc_b16 : RegOrB16 <"VS_32", "OPERAND_REG_INLINE_C">; +def VCSrc_f16 : RegOrF16 <"VS_32", "OPERAND_REG_INLINE_C">; +def VCSrc_b32 : RegOrB32 <"VS_32", "OPERAND_REG_INLINE_C">; +def VCSrc_f32 : RegOrF32 <"VS_32", "OPERAND_REG_INLINE_C">; +def VCSrc_v2b16 : RegOrV2B16 <"VS_32", "OPERAND_REG_INLINE_C">; +def VCSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_INLINE_C">; //===----------------------------------------------------------------------===// // VISrc_* Operands with a VGPR or an inline constant //===----------------------------------------------------------------------===// -defm VISrc : RegInlineOperand32<"VGPR", "VISrc">; -let DecoderMethod = 
"decodeOperand_VReg_64" in -defm VISrc_64 : RegInlineOperand64<"VReg", "VISrc_64", "_64">; -defm VISrc_128 : RegInlineOperandAC<"VReg", "VISrc_128", "_128">; -let DecoderMethod = "decodeOperand_VReg_256" in -defm VISrc_256 : RegInlineOperand64<"VReg", "VISrc_256", "_256">; -defm VISrc_512 : RegInlineOperandAC<"VReg", "VISrc_512", "_512">; -defm VISrc_1024 : RegInlineOperandAC<"VReg", "VISrc_1024", "_1024">; +def VISrc_64_f64 : RegOrF64 <"VReg_64", "OPERAND_REG_INLINE_C">; +def VISrc_128_b32 : RegOrB32 <"VReg_128", "OPERAND_REG_INLINE_C">; +def VISrc_128_f32 : RegOrF32 <"VReg_128", "OPERAND_REG_INLINE_C">; +def VISrc_256_f64 : RegOrF64 <"VReg_256", "OPERAND_REG_INLINE_C">; +def VISrc_512_b32 : RegOrB32 <"VReg_512", "OPERAND_REG_INLINE_C">; +def VISrc_512_f32 : RegOrF32 <"VReg_512", "OPERAND_REG_INLINE_C">; +def VISrc_1024_b32 : RegOrB32 <"VReg_1024", "OPERAND_REG_INLINE_C">; +def VISrc_1024_f32 : RegOrF32 <"VReg_1024", "OPERAND_REG_INLINE_C">; //===----------------------------------------------------------------------===// // AVSrc_*, AVDst_*, AVLdSt_* Operands with an AGPR or VGPR //===----------------------------------------------------------------------===// def AVSrc_32 : RegisterOperand<AV_32> { - let DecoderMethod = "DecodeAV_32RegisterClass"; + let DecoderMethod = "decodeOperand_AV_32"; let EncoderMethod = "getAVOperandEncoding"; } def AVSrc_64 : RegisterOperand<AV_64> { - let DecoderMethod = "DecodeAV_64RegisterClass"; + let DecoderMethod = "decodeOperand_AV_64"; let EncoderMethod = "getAVOperandEncoding"; } def AVSrc_128 : RegisterOperand<AV_128> { - let DecoderMethod = "DecodeAV_128RegisterClass"; + let DecoderMethod = "decodeOperand_AV_128"; let EncoderMethod = "getAVOperandEncoding"; } @@ -1368,12 +1324,11 @@ def AVLdSt_160 : RegisterOperand<AV_160> { // ACSrc_* Operands with an AGPR or an inline constant //===----------------------------------------------------------------------===// -defm AISrc : RegInlineOperandAC<"AGPR", "AISrc">; -defm AISrc_128 : 
RegInlineOperandAC<"AReg", "AISrc_128", "_128">; -defm AISrc_512 : RegInlineOperandAC<"AReg", "AISrc_512", "_512">; -defm AISrc_1024 : RegInlineOperandAC<"AReg", "AISrc_1024", "_1024">; - -let DecoderMethod = "decodeOperand_AReg_64" in -defm AISrc_64 : RegInlineOperandAC64<"AReg", "AISrc_64", "_64">; -let DecoderMethod = "decodeOperand_AReg_256" in -defm AISrc_256 : RegInlineOperandAC64<"AReg", "AISrc_256", "_256">; +def AISrc_64_f64 : RegOrF64 <"AReg_64", "OPERAND_REG_INLINE_AC">; +def AISrc_128_f32 : RegOrF32 <"AReg_128", "OPERAND_REG_INLINE_AC">; +def AISrc_128_b32 : RegOrB32 <"AReg_128", "OPERAND_REG_INLINE_AC">; +def AISrc_256_f64 : RegOrF64 <"AReg_256", "OPERAND_REG_INLINE_AC">; +def AISrc_512_f32 : RegOrF32 <"AReg_512", "OPERAND_REG_INLINE_AC">; +def AISrc_512_b32 : RegOrB32 <"AReg_512", "OPERAND_REG_INLINE_AC">; +def AISrc_1024_f32 : RegOrF32 <"AReg_1024", "OPERAND_REG_INLINE_AC">; +def AISrc_1024_b32 : RegOrB32 <"AReg_1024", "OPERAND_REG_INLINE_AC">; diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index bec07d990380..4159dc694c1e 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -161,14 +161,12 @@ bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const { bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const { return isInt<16>(Src.getImm()) && - !TII->isInlineConstant(*Src.getParent(), - Src.getParent()->getOperandNo(&Src)); + !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo()); } bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const { return isUInt<16>(Src.getImm()) && - !TII->isInlineConstant(*Src.getParent(), - Src.getParent()->getOperandNo(&Src)); + !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo()); } bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src, @@ -310,7 +308,10 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) 
const { unsigned NextVgpr = 0; bool IsUndef = true; bool IsKill = NewAddrDwords == Info->VAddrDwords; - for (unsigned Idx = 0; Idx < Info->VAddrOperands; ++Idx) { + const unsigned NSAMaxSize = ST->getNSAMaxSize(); + const bool IsPartialNSA = NewAddrDwords > NSAMaxSize; + const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands; + for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) { const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx); unsigned Vgpr = TRI->getHWRegIndex(Op.getReg()); unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32; @@ -363,13 +364,13 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const { MI.getOperand(VAddr0Idx).setIsUndef(IsUndef); MI.getOperand(VAddr0Idx).setIsKill(IsKill); - for (int i = 1; i < Info->VAddrOperands; ++i) + for (unsigned i = 1; i < EndVAddr; ++i) MI.removeOperand(VAddr0Idx + 1); if (ToUntie >= 0) { MI.tieOperands( AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata), - ToUntie - (Info->VAddrOperands - 1)); + ToUntie - (EndVAddr - 1)); } } @@ -475,7 +476,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { } } -/// Attempt to shink AND/OR/XOR operations requiring non-inlineable literals. +/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals. /// For AND or OR, try using S_BITSET{0,1} to clear or set bits. /// If the inverse of the immediate is legal, use ANDN2, ORN2 or /// XNOR (as a ^ b == ~(a ^ ~b)). 
@@ -497,7 +498,7 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { if (Opc == AMDGPU::S_AND_B32) { if (isPowerOf2_32(~Imm)) { - NewImm = countTrailingOnes(Imm); + NewImm = llvm::countr_one(Imm); Opc = AMDGPU::S_BITSET0_B32; } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) { NewImm = ~Imm; @@ -505,7 +506,7 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const { } } else if (Opc == AMDGPU::S_OR_B32) { if (isPowerOf2_32(Imm)) { - NewImm = countTrailingZeros(Imm); + NewImm = llvm::countr_zero(Imm); Opc = AMDGPU::S_BITSET1_B32; } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) { NewImm = ~Imm; diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 4d6669f8f94d..3143d437e370 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -158,10 +158,11 @@ private: MachinePostDominatorTree *PDT; unsigned AndOpc; + unsigned AndTermOpc; unsigned AndN2Opc; unsigned XorOpc; unsigned AndSaveExecOpc; - unsigned OrSaveExecOpc; + unsigned AndSaveExecTermOpc; unsigned WQMOpc; Register Exec; Register LiveMaskReg; @@ -380,8 +381,8 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR, if (Reg.isVirtual()) { // Iterate over all operands to find relevant definitions bool HasDef = false; - for (const MachineOperand &Op : MI->operands()) { - if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg)) + for (const MachineOperand &Op : MI->all_defs()) { + if (Op.getReg() != Reg) continue; // Compute lanes defined and overlap with use @@ -453,14 +454,13 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI, // Handle physical registers that we need to track; this is mostly relevant // for VCC, which can appear as the (implicit) input of a uniform branch, // e.g. when a loop counter is stored in a VGPR. 
- for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid(); - ++RegUnit) { - LiveRange &LR = LIS->getRegUnit(*RegUnit); + for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) { + LiveRange &LR = LIS->getRegUnit(Unit); const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); if (!Value) continue; - markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist); + markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist); } } } @@ -471,11 +471,8 @@ void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": " << MI); - for (const MachineOperand &Use : MI.uses()) { - if (!Use.isReg() || !Use.isUse()) - continue; + for (const MachineOperand &Use : MI.all_uses()) markOperand(MI, Use, Flag, Worklist); - } } // Scan instructions to determine which ones require an Exact execmask and @@ -1139,7 +1136,7 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( return PreferLast ? Last : First; LiveRange &LR = - LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); + LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin()); auto MBBE = MBB.end(); SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First) : LIS->getMBBEndIdx(&MBB); @@ -1185,11 +1182,9 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( // does not need to be preserved. 
while (MBBI != Last) { bool IsExecDef = false; - for (const MachineOperand &MO : MBBI->operands()) { - if (MO.isReg() && MO.isDef()) { - IsExecDef |= - MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC; - } + for (const MachineOperand &MO : MBBI->all_defs()) { + IsExecDef |= + MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC; } if (!IsExecDef) break; @@ -1206,13 +1201,25 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, Register SaveWQM) { + bool IsTerminator = Before == MBB.end(); + if (!IsTerminator) { + auto FirstTerm = MBB.getFirstTerminator(); + if (FirstTerm != MBB.end()) { + SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm); + SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before); + IsTerminator = BeforeIdx > FirstTermIdx; + } + } + MachineInstr *MI; if (SaveWQM) { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM) + unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc; + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM) .addReg(LiveMaskReg); } else { - MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec) + unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc; + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec) .addReg(Exec) .addReg(LiveMaskReg); } @@ -1365,7 +1372,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) { Needs = StateExact | StateWQM | StateStrict; } - if (MI.isTerminator() && OutNeeds == StateExact) + // Exact mode exit can occur in terminators, but must be before branches. + if (MI.isBranch() && OutNeeds == StateExact) Needs = StateExact; ++Next; @@ -1539,7 +1547,11 @@ void SIWholeQuadMode::lowerCopyInstrs() { assert(MI->getNumExplicitOperands() == 2); } - MI->setDesc(TII->get(AMDGPU::COPY)); + unsigned CopyOp = MI->getOperand(1).isReg() + ? 
(unsigned)AMDGPU::COPY + : TII->getMovOpcode(TRI->getRegClassForOperandReg( + *MRI, MI->getOperand(0))); + MI->setDesc(TII->get(CopyOp)); } } @@ -1587,18 +1599,20 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { if (ST->isWave32()) { AndOpc = AMDGPU::S_AND_B32; + AndTermOpc = AMDGPU::S_AND_B32_term; AndN2Opc = AMDGPU::S_ANDN2_B32; XorOpc = AMDGPU::S_XOR_B32; AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32; - OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32; + AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term; WQMOpc = AMDGPU::S_WQM_B32; Exec = AMDGPU::EXEC_LO; } else { AndOpc = AMDGPU::S_AND_B64; + AndTermOpc = AMDGPU::S_AND_B64_term; AndN2Opc = AMDGPU::S_ANDN2_B64; XorOpc = AMDGPU::S_XOR_B64; AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64; - OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64; + AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term; WQMOpc = AMDGPU::S_WQM_B64; Exec = AMDGPU::EXEC; } diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index f271f6d42857..7ca685a0cc5d 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -6,22 +6,12 @@ // //===----------------------------------------------------------------------===// -def smrd_offset_8 : NamedOperandU32<"SMRDOffset8", - NamedMatchClass<"SMRDOffset8">> { - let OperandType = "OPERAND_IMMEDIATE"; -} - -class SMEMOffset : NamedOperandU32<"SMEMOffset", - NamedMatchClass<"SMEMOffset">> { - let OperandType = "OPERAND_IMMEDIATE"; - let EncoderMethod = "getSMEMOffsetEncoding"; - let DecoderMethod = "decodeSMEMOffset"; -} +def smrd_offset_8 : ImmOperand<i32, "SMRDOffset8", 1>; -def smem_offset : SMEMOffset; - -def smem_offset_mod : SMEMOffset { - let PrintMethod = "printSMEMOffsetMod"; +let EncoderMethod = "getSMEMOffsetEncoding", + DecoderMethod = "decodeSMEMOffset" in { +def smem_offset : ImmOperand<i32, "SMEMOffset", 1>; +def smem_offset_mod : NamedIntOperand<i32, "offset", "SMEMOffsetMod">; } 
//===----------------------------------------------------------------------===// @@ -124,6 +114,7 @@ class SM_Load_Pseudo <string opName, RegisterClass baseClass, " $sdst, $sbase, " # offsets.Asm # "$cpol", []> { RegisterClass BaseClass = baseClass; let mayLoad = 1; + let isReMaterializable = 1; let mayStore = 0; let has_glc = 1; let has_dlc = 1; @@ -138,7 +129,6 @@ class SM_Store_Pseudo <string opName, RegisterClass baseClass, offsets.Ins, (ins CPol:$cpol)), " $sdata, $sbase, " # offsets.Asm # "$cpol"> { RegisterClass BaseClass = baseClass; - RegisterClass SrcClass = srcClass; let mayLoad = 0; let mayStore = 1; let has_glc = 1; @@ -163,23 +153,24 @@ class SM_Discard_Pseudo <string opName, OffsetMode offsets> let PseudoInstr = opName # offsets.Variant; } -multiclass SM_Pseudo_Loads<string opName, - RegisterClass baseClass, +multiclass SM_Pseudo_Loads<RegisterClass baseClass, RegisterClass dstClass> { + defvar opName = !tolower(NAME); def _IMM : SM_Load_Pseudo <opName, baseClass, dstClass, IMM_Offset>; def _SGPR : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_Offset>; def _SGPR_IMM : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_IMM_Offset>; } -multiclass SM_Pseudo_Stores<string opName, - RegisterClass baseClass, - RegisterClass srcClass> { +multiclass SM_Pseudo_Stores<RegisterClass baseClass, + RegisterClass srcClass> { + defvar opName = !tolower(NAME); def _IMM : SM_Store_Pseudo <opName, baseClass, srcClass, IMM_Offset>; def _SGPR : SM_Store_Pseudo <opName, baseClass, srcClass, SGPR_Offset>; def _SGPR_IMM : SM_Store_Pseudo <opName, baseClass, srcClass, SGPR_IMM_Offset>; } -multiclass SM_Pseudo_Discards<string opName> { +multiclass SM_Pseudo_Discards { + defvar opName = !tolower(NAME); def _IMM : SM_Discard_Pseudo <opName, IMM_Offset>; def _SGPR : SM_Discard_Pseudo <opName, SGPR_Offset>; def _SGPR_IMM : SM_Discard_Pseudo <opName, SGPR_IMM_Offset>; @@ -204,7 +195,8 @@ class SM_Inval_Pseudo <string opName, SDPatternOperator node = null_frag> : SM_P let 
has_sbase = 0; } -multiclass SM_Pseudo_Probe<string opName, RegisterClass baseClass> { +multiclass SM_Pseudo_Probe<RegisterClass baseClass> { + defvar opName = !tolower(NAME); def _IMM : SM_Probe_Pseudo <opName, baseClass, IMM_Offset>; def _SGPR : SM_Probe_Pseudo <opName, baseClass, SGPR_Offset>; def _SGPR_IMM : SM_Probe_Pseudo <opName, baseClass, SGPR_IMM_Offset>; @@ -270,9 +262,9 @@ class SM_Pseudo_Atomic<string opName, let DisableEncoding = !if(isRet, "$sdata", ""); } -multiclass SM_Pseudo_Atomics<string opName, - RegisterClass baseClass, +multiclass SM_Pseudo_Atomics<RegisterClass baseClass, RegisterClass dataClass> { + defvar opName = !tolower(NAME); def _IMM : SM_Pseudo_Atomic <opName, baseClass, dataClass, IMM_Offset, 0>; def _SGPR : SM_Pseudo_Atomic <opName, baseClass, dataClass, SGPR_Offset, 0>; def _SGPR_IMM : SM_Pseudo_Atomic <opName, baseClass, dataClass, SGPR_IMM_Offset, 0>; @@ -291,53 +283,31 @@ multiclass SM_Pseudo_Atomics<string opName, // XXX - SMEM instructions do not allow exec for data operand, but // does sdst for SMRD on SI/CI? 
-defm S_LOAD_DWORD : SM_Pseudo_Loads <"s_load_dword", SReg_64, SReg_32_XM0_XEXEC>; -defm S_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_load_dwordx2", SReg_64, SReg_64_XEXEC>; -defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_load_dwordx4", SReg_64, SReg_128>; -defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <"s_load_dwordx8", SReg_64, SReg_256>; -defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <"s_load_dwordx16", SReg_64, SReg_512>; +defm S_LOAD_DWORD : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>; +defm S_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_64, SReg_64_XEXEC>; +defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_64, SReg_512>; let is_buffer = 1 in { -defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads < - "s_buffer_load_dword", SReg_128, SReg_32_XM0_XEXEC ->; - +defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; // FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on // SI/CI, bit disallowed for SMEM on VI. 
-defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads < - "s_buffer_load_dwordx2", SReg_128, SReg_64_XEXEC ->; - -defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads < - "s_buffer_load_dwordx4", SReg_128, SReg_128 ->; - -defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads < - "s_buffer_load_dwordx8", SReg_128, SReg_256 ->; - -defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads < - "s_buffer_load_dwordx16", SReg_128, SReg_512 ->; +defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128, SReg_128>; +defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128, SReg_256>; +defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128, SReg_512>; } let SubtargetPredicate = HasScalarStores in { -defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0_XEXEC>; -defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64_XEXEC>; -defm S_STORE_DWORDX4 : SM_Pseudo_Stores <"s_store_dwordx4", SReg_64, SReg_128>; +defm S_STORE_DWORD : SM_Pseudo_Stores <SReg_64, SReg_32_XM0_XEXEC>; +defm S_STORE_DWORDX2 : SM_Pseudo_Stores <SReg_64, SReg_64_XEXEC>; +defm S_STORE_DWORDX4 : SM_Pseudo_Stores <SReg_64, SReg_128>; let is_buffer = 1 in { -defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores < - "s_buffer_store_dword", SReg_128, SReg_32_XM0_XEXEC ->; - -defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores < - "s_buffer_store_dwordx2", SReg_128, SReg_64_XEXEC ->; - -defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores < - "s_buffer_store_dwordx4", SReg_128, SReg_128 ->; +defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores <SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <SReg_128, SReg_128>; } } // End SubtargetPredicate = HasScalarStores @@ -355,9 +325,9 @@ def S_DCACHE_WB : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>; def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>; } // End OtherPredicates 
= [HasScalarStores] -defm S_ATC_PROBE : SM_Pseudo_Probe <"s_atc_probe", SReg_64>; +defm S_ATC_PROBE : SM_Pseudo_Probe <SReg_64>; let is_buffer = 1 in { -defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>; +defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <SReg_128>; } } // SubtargetPredicate = isGFX8Plus @@ -371,80 +341,80 @@ def S_GET_WAVEID_IN_WORKGROUP : SM_WaveId_Pseudo <"s_get_waveid_in_workgroup", i let SubtargetPredicate = HasScalarFlatScratchInsts, Uses = [FLAT_SCR] in { -defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>; -defm S_SCRATCH_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_scratch_load_dwordx2", SReg_64, SReg_64_XEXEC>; -defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_scratch_load_dwordx4", SReg_64, SReg_128>; +defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>; +defm S_SCRATCH_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_64, SReg_64_XEXEC>; +defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_64, SReg_128>; -defm S_SCRATCH_STORE_DWORD : SM_Pseudo_Stores <"s_scratch_store_dword", SReg_64, SReg_32_XM0_XEXEC>; -defm S_SCRATCH_STORE_DWORDX2 : SM_Pseudo_Stores <"s_scratch_store_dwordx2", SReg_64, SReg_64_XEXEC>; -defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <"s_scratch_store_dwordx4", SReg_64, SReg_128>; +defm S_SCRATCH_STORE_DWORD : SM_Pseudo_Stores <SReg_64, SReg_32_XM0_XEXEC>; +defm S_SCRATCH_STORE_DWORDX2 : SM_Pseudo_Stores <SReg_64, SReg_64_XEXEC>; +defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <SReg_64, SReg_128>; } // SubtargetPredicate = HasScalarFlatScratchInsts let SubtargetPredicate = HasScalarAtomics in { let is_buffer = 1 in { -defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_buffer_atomic_swap", SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap", SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics <"s_buffer_atomic_add", SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_SUB : 
SM_Pseudo_Atomics <"s_buffer_atomic_sub", SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_SMIN : SM_Pseudo_Atomics <"s_buffer_atomic_smin", SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_UMIN : SM_Pseudo_Atomics <"s_buffer_atomic_umin", SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_SMAX : SM_Pseudo_Atomics <"s_buffer_atomic_smax", SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_UMAX : SM_Pseudo_Atomics <"s_buffer_atomic_umax", SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_AND : SM_Pseudo_Atomics <"s_buffer_atomic_and", SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_OR : SM_Pseudo_Atomics <"s_buffer_atomic_or", SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_XOR : SM_Pseudo_Atomics <"s_buffer_atomic_xor", SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_INC : SM_Pseudo_Atomics <"s_buffer_atomic_inc", SReg_128, SReg_32_XM0_XEXEC>; -defm S_BUFFER_ATOMIC_DEC : SM_Pseudo_Atomics <"s_buffer_atomic_dec", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_SUB : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_SMIN : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_UMIN : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_SMAX : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_UMAX : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_AND : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_OR : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_XOR : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_INC : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_DEC : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>; -defm 
S_BUFFER_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_swap_x2", SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap_x2", SReg_128, SReg_128>; -defm S_BUFFER_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_add_x2", SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_sub_x2", SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_smin_x2", SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_umin_x2", SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_smax_x2", SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_umax_x2", SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_AND_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_and_x2", SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_or_x2", SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_xor_x2", SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_inc_x2", SReg_128, SReg_64_XEXEC>; -defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_dec_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <SReg_128, SReg_128>; +defm S_BUFFER_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_AND_X2 : SM_Pseudo_Atomics 
<SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>; } -defm S_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_atomic_swap", SReg_64, SReg_32_XM0_XEXEC>; -defm S_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_atomic_cmpswap", SReg_64, SReg_64_XEXEC>; -defm S_ATOMIC_ADD : SM_Pseudo_Atomics <"s_atomic_add", SReg_64, SReg_32_XM0_XEXEC>; -defm S_ATOMIC_SUB : SM_Pseudo_Atomics <"s_atomic_sub", SReg_64, SReg_32_XM0_XEXEC>; -defm S_ATOMIC_SMIN : SM_Pseudo_Atomics <"s_atomic_smin", SReg_64, SReg_32_XM0_XEXEC>; -defm S_ATOMIC_UMIN : SM_Pseudo_Atomics <"s_atomic_umin", SReg_64, SReg_32_XM0_XEXEC>; -defm S_ATOMIC_SMAX : SM_Pseudo_Atomics <"s_atomic_smax", SReg_64, SReg_32_XM0_XEXEC>; -defm S_ATOMIC_UMAX : SM_Pseudo_Atomics <"s_atomic_umax", SReg_64, SReg_32_XM0_XEXEC>; -defm S_ATOMIC_AND : SM_Pseudo_Atomics <"s_atomic_and", SReg_64, SReg_32_XM0_XEXEC>; -defm S_ATOMIC_OR : SM_Pseudo_Atomics <"s_atomic_or", SReg_64, SReg_32_XM0_XEXEC>; -defm S_ATOMIC_XOR : SM_Pseudo_Atomics <"s_atomic_xor", SReg_64, SReg_32_XM0_XEXEC>; -defm S_ATOMIC_INC : SM_Pseudo_Atomics <"s_atomic_inc", SReg_64, SReg_32_XM0_XEXEC>; -defm S_ATOMIC_DEC : SM_Pseudo_Atomics <"s_atomic_dec", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_SWAP : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_ADD : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_SUB : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_SMIN : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_UMIN : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_SMAX : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_UMAX : SM_Pseudo_Atomics <SReg_64, 
SReg_32_XM0_XEXEC>; +defm S_ATOMIC_AND : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_OR : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_XOR : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_INC : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_DEC : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>; -defm S_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <"s_atomic_swap_x2", SReg_64, SReg_64_XEXEC>; -defm S_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <"s_atomic_cmpswap_x2", SReg_64, SReg_128>; -defm S_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <"s_atomic_add_x2", SReg_64, SReg_64_XEXEC>; -defm S_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <"s_atomic_sub_x2", SReg_64, SReg_64_XEXEC>; -defm S_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <"s_atomic_smin_x2", SReg_64, SReg_64_XEXEC>; -defm S_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <"s_atomic_umin_x2", SReg_64, SReg_64_XEXEC>; -defm S_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <"s_atomic_smax_x2", SReg_64, SReg_64_XEXEC>; -defm S_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <"s_atomic_umax_x2", SReg_64, SReg_64_XEXEC>; -defm S_ATOMIC_AND_X2 : SM_Pseudo_Atomics <"s_atomic_and_x2", SReg_64, SReg_64_XEXEC>; -defm S_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_atomic_or_x2", SReg_64, SReg_64_XEXEC>; -defm S_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_atomic_xor_x2", SReg_64, SReg_64_XEXEC>; -defm S_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_atomic_inc_x2", SReg_64, SReg_64_XEXEC>; -defm S_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_atomic_dec_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <SReg_64, SReg_128>; +defm S_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>; +defm 
S_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_AND_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_OR_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_INC_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>; } // let SubtargetPredicate = HasScalarAtomics let SubtargetPredicate = HasScalarAtomics in { -defm S_DCACHE_DISCARD : SM_Pseudo_Discards <"s_dcache_discard">; -defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards <"s_dcache_discard_x2">; +defm S_DCACHE_DISCARD : SM_Pseudo_Discards; +defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards; } //===----------------------------------------------------------------------===// @@ -471,30 +441,27 @@ class SMRD_Real_si <bits<5> op, SM_Pseudo ps> let Inst{31-27} = 0x18; //encoding } -multiclass SM_Real_Loads_si<bits<5> op, string ps, - SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM), - SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> { - +multiclass SM_Real_Loads_si<bits<5> op> { + defvar ps = NAME; + defvar immPs = !cast<SM_Load_Pseudo>(ps#_IMM); def _IMM_si : SMRD_Real_si <op, immPs> { let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, CPol:$cpol); } - def _SGPR_si : SMRD_Real_si <op, sgprPs> { - let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol); - } - + defvar sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR); + def _SGPR_si : SMRD_Real_si <op, sgprPs>; } -defm S_LOAD_DWORD : SM_Real_Loads_si <0x00, "S_LOAD_DWORD">; -defm S_LOAD_DWORDX2 : SM_Real_Loads_si <0x01, "S_LOAD_DWORDX2">; -defm S_LOAD_DWORDX4 : SM_Real_Loads_si <0x02, "S_LOAD_DWORDX4">; -defm S_LOAD_DWORDX8 : SM_Real_Loads_si <0x03, "S_LOAD_DWORDX8">; -defm S_LOAD_DWORDX16 : SM_Real_Loads_si <0x04, "S_LOAD_DWORDX16">; -defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_si <0x08, "S_BUFFER_LOAD_DWORD">; -defm S_BUFFER_LOAD_DWORDX2 : 
SM_Real_Loads_si <0x09, "S_BUFFER_LOAD_DWORDX2">; -defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_si <0x0a, "S_BUFFER_LOAD_DWORDX4">; -defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_si <0x0b, "S_BUFFER_LOAD_DWORDX8">; -defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_si <0x0c, "S_BUFFER_LOAD_DWORDX16">; +defm S_LOAD_DWORD : SM_Real_Loads_si <0x00>; +defm S_LOAD_DWORDX2 : SM_Real_Loads_si <0x01>; +defm S_LOAD_DWORDX4 : SM_Real_Loads_si <0x02>; +defm S_LOAD_DWORDX8 : SM_Real_Loads_si <0x03>; +defm S_LOAD_DWORDX16 : SM_Real_Loads_si <0x04>; +defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_si <0x08>; +defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_si <0x09>; +defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_si <0x0a>; +defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_si <0x0b>; +defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_si <0x0c>; def S_MEMTIME_si : SMRD_Real_si <0x1e, S_MEMTIME>; def S_DCACHE_INV_si : SMRD_Real_si <0x1f, S_DCACHE_INV>; @@ -548,11 +515,8 @@ class SMEM_Real_vi <bits<8> op, SM_Pseudo ps> soffset{6-0}, ?); } -class SMEM_Real_Load_vi<bits<8> op, string ps, OffsetMode offsets> - : SMEM_Real_vi<op, !cast<SM_Pseudo>(ps # offsets.Variant)> { - RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass; - let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol)); -} +class SMEM_Real_Load_vi<bits<8> op, string ps> + : SMEM_Real_vi<op, !cast<SM_Pseudo>(ps)>; // The alternative GFX9 SGPR encoding using soffset to encode the // offset register. 
Not available in assembler and goes to the GFX9 @@ -565,13 +529,14 @@ class SMEM_Real_SGPR_alt_gfx9 { string AsmVariantName = "NonParsable"; } -multiclass SM_Real_Loads_vi<bits<8> op, string ps> { - def _IMM_vi : SMEM_Real_Load_vi <op, ps, IMM_Offset>; - def _SGPR_vi : SMEM_Real_Load_vi <op, ps, SGPR_Offset>; - def _SGPR_alt_gfx9 : SMEM_Real_Load_vi <op, ps, SGPR_Offset>, +multiclass SM_Real_Loads_vi<bits<8> op> { + defvar ps = NAME; + def _IMM_vi : SMEM_Real_Load_vi <op, ps#"_IMM">; + def _SGPR_vi : SMEM_Real_Load_vi <op, ps#"_SGPR">; + def _SGPR_alt_gfx9 : SMEM_Real_Load_vi <op, ps#"_SGPR">, SMEM_Real_SGPR_alt_gfx9; let IsGFX9SpecificEncoding = true in - def _SGPR_IMM_gfx9 : SMEM_Real_Load_vi <op, ps, SGPR_IMM_Offset>; + def _SGPR_IMM_gfx9 : SMEM_Real_Load_vi <op, ps#"_SGPR_IMM">; } class SMEM_Real_Store_Base_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> { @@ -582,24 +547,21 @@ class SMEM_Real_Store_Base_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?); } -class SMEM_Real_Store_vi <bits<8> op, string ps, OffsetMode offsets> - : SMEM_Real_Store_Base_vi <op, !cast<SM_Pseudo>(ps # offsets.Variant)> { - RegisterClass SrcClass = !cast<SM_Store_Pseudo>(ps # offsets.Variant).SrcClass; - RegisterClass BaseClass = !cast<SM_Store_Pseudo>(ps # offsets.Variant).BaseClass; - let InOperandList = !con((ins SrcClass:$sdata, BaseClass:$sbase), - offsets.Ins, (ins CPol:$cpol)); -} +class SMEM_Real_Store_vi <bits<8> op, string ps> + : SMEM_Real_Store_Base_vi <op, !cast<SM_Pseudo>(ps)>; -multiclass SM_Real_Stores_vi<bits<8> op, string ps> { - def _IMM_vi : SMEM_Real_Store_vi <op, ps, IMM_Offset>; - def _SGPR_vi : SMEM_Real_Store_vi <op, ps, SGPR_Offset>; - def _SGPR_alt_gfx9 : SMEM_Real_Store_vi <op, ps, SGPR_Offset>, +multiclass SM_Real_Stores_vi<bits<8> op> { + defvar ps = NAME; + def _IMM_vi : SMEM_Real_Store_vi <op, ps#"_IMM">; + def _SGPR_vi : SMEM_Real_Store_vi <op, ps#"_SGPR">; + def _SGPR_alt_gfx9 : 
SMEM_Real_Store_vi <op, ps#"_SGPR">, SMEM_Real_SGPR_alt_gfx9; let IsGFX9SpecificEncoding = true in - def _SGPR_IMM_gfx9 : SMEM_Real_Store_vi <op, ps, SGPR_IMM_Offset>; + def _SGPR_IMM_gfx9 : SMEM_Real_Store_vi <op, ps#"_SGPR_IMM">; } -multiclass SM_Real_Probe_vi<bits<8> op, string ps> { +multiclass SM_Real_Probe_vi<bits<8> op> { + defvar ps = NAME; def _IMM_vi : SMEM_Real_Store_Base_vi <op, !cast<SM_Probe_Pseudo>(ps#_IMM)>; def _SGPR_vi : SMEM_Real_Store_Base_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>; def _SGPR_alt_gfx9 @@ -610,24 +572,24 @@ multiclass SM_Real_Probe_vi<bits<8> op, string ps> { : SMEM_Real_Store_Base_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR_IMM)>; } -defm S_LOAD_DWORD : SM_Real_Loads_vi <0x00, "S_LOAD_DWORD">; -defm S_LOAD_DWORDX2 : SM_Real_Loads_vi <0x01, "S_LOAD_DWORDX2">; -defm S_LOAD_DWORDX4 : SM_Real_Loads_vi <0x02, "S_LOAD_DWORDX4">; -defm S_LOAD_DWORDX8 : SM_Real_Loads_vi <0x03, "S_LOAD_DWORDX8">; -defm S_LOAD_DWORDX16 : SM_Real_Loads_vi <0x04, "S_LOAD_DWORDX16">; -defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_vi <0x08, "S_BUFFER_LOAD_DWORD">; -defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_vi <0x09, "S_BUFFER_LOAD_DWORDX2">; -defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_vi <0x0a, "S_BUFFER_LOAD_DWORDX4">; -defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_vi <0x0b, "S_BUFFER_LOAD_DWORDX8">; -defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_vi <0x0c, "S_BUFFER_LOAD_DWORDX16">; +defm S_LOAD_DWORD : SM_Real_Loads_vi <0x00>; +defm S_LOAD_DWORDX2 : SM_Real_Loads_vi <0x01>; +defm S_LOAD_DWORDX4 : SM_Real_Loads_vi <0x02>; +defm S_LOAD_DWORDX8 : SM_Real_Loads_vi <0x03>; +defm S_LOAD_DWORDX16 : SM_Real_Loads_vi <0x04>; +defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_vi <0x08>; +defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_vi <0x09>; +defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_vi <0x0a>; +defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_vi <0x0b>; +defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_vi <0x0c>; -defm S_STORE_DWORD : SM_Real_Stores_vi <0x10, "S_STORE_DWORD">; -defm S_STORE_DWORDX2 
: SM_Real_Stores_vi <0x11, "S_STORE_DWORDX2">; -defm S_STORE_DWORDX4 : SM_Real_Stores_vi <0x12, "S_STORE_DWORDX4">; +defm S_STORE_DWORD : SM_Real_Stores_vi <0x10>; +defm S_STORE_DWORDX2 : SM_Real_Stores_vi <0x11>; +defm S_STORE_DWORDX4 : SM_Real_Stores_vi <0x12>; -defm S_BUFFER_STORE_DWORD : SM_Real_Stores_vi <0x18, "S_BUFFER_STORE_DWORD">; -defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_vi <0x19, "S_BUFFER_STORE_DWORDX2">; -defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_vi <0x1a, "S_BUFFER_STORE_DWORDX4">; +defm S_BUFFER_STORE_DWORD : SM_Real_Stores_vi <0x18>; +defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_vi <0x19>; +defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_vi <0x1a>; // These instructions use same encoding def S_DCACHE_INV_vi : SMEM_Real_vi <0x20, S_DCACHE_INV>; @@ -637,16 +599,16 @@ def S_DCACHE_WB_VOL_vi : SMEM_Real_vi <0x23, S_DCACHE_WB_VOL>; def S_MEMTIME_vi : SMEM_Real_vi <0x24, S_MEMTIME>; def S_MEMREALTIME_vi : SMEM_Real_vi <0x25, S_MEMREALTIME>; -defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_vi <0x05, "S_SCRATCH_LOAD_DWORD">; -defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_vi <0x06, "S_SCRATCH_LOAD_DWORDX2">; -defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_vi <0x07, "S_SCRATCH_LOAD_DWORDX4">; +defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_vi <0x05>; +defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_vi <0x06>; +defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_vi <0x07>; -defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_vi <0x15, "S_SCRATCH_STORE_DWORD">; -defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_vi <0x16, "S_SCRATCH_STORE_DWORDX2">; -defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_vi <0x17, "S_SCRATCH_STORE_DWORDX4">; +defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_vi <0x15>; +defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_vi <0x16>; +defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_vi <0x17>; -defm S_ATC_PROBE : SM_Real_Probe_vi <0x26, "S_ATC_PROBE">; -defm S_ATC_PROBE_BUFFER : SM_Real_Probe_vi <0x27, "S_ATC_PROBE_BUFFER">; +defm S_ATC_PROBE : SM_Real_Probe_vi <0x26>; +defm 
S_ATC_PROBE_BUFFER : SM_Real_Probe_vi <0x27>; //===----------------------------------------------------------------------===// // GFX9 @@ -665,7 +627,8 @@ class SMEM_Atomic_Real_vi <bits<8> op, SM_Atomic_Pseudo ps> let Inst{12-6} = !if(ps.glc, sdst{6-0}, sdata{6-0}); } -multiclass SM_Real_Atomics_vi<bits<8> op, string ps> { +multiclass SM_Real_Atomics_vi<bits<8> op> { + defvar ps = NAME; def _IMM_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>; def _SGPR_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>; def _SGPR_alt_gfx9 @@ -684,63 +647,64 @@ multiclass SM_Real_Atomics_vi<bits<8> op, string ps> { : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_IMM_RTN)>; } -defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_vi <0x40, "S_BUFFER_ATOMIC_SWAP">; -defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x41, "S_BUFFER_ATOMIC_CMPSWAP">; -defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_vi <0x42, "S_BUFFER_ATOMIC_ADD">; -defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_vi <0x43, "S_BUFFER_ATOMIC_SUB">; -defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_vi <0x44, "S_BUFFER_ATOMIC_SMIN">; -defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_vi <0x45, "S_BUFFER_ATOMIC_UMIN">; -defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_vi <0x46, "S_BUFFER_ATOMIC_SMAX">; -defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_vi <0x47, "S_BUFFER_ATOMIC_UMAX">; -defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_vi <0x48, "S_BUFFER_ATOMIC_AND">; -defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_vi <0x49, "S_BUFFER_ATOMIC_OR">; -defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_vi <0x4a, "S_BUFFER_ATOMIC_XOR">; -defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_vi <0x4b, "S_BUFFER_ATOMIC_INC">; -defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_vi <0x4c, "S_BUFFER_ATOMIC_DEC">; +defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_vi <0x40>; +defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x41>; +defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_vi <0x42>; +defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_vi <0x43>; +defm S_BUFFER_ATOMIC_SMIN : 
SM_Real_Atomics_vi <0x44>; +defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_vi <0x45>; +defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_vi <0x46>; +defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_vi <0x47>; +defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_vi <0x48>; +defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_vi <0x49>; +defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_vi <0x4a>; +defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_vi <0x4b>; +defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_vi <0x4c>; -defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0x60, "S_BUFFER_ATOMIC_SWAP_X2">; -defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0x61, "S_BUFFER_ATOMIC_CMPSWAP_X2">; -defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0x62, "S_BUFFER_ATOMIC_ADD_X2">; -defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0x63, "S_BUFFER_ATOMIC_SUB_X2">; -defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0x64, "S_BUFFER_ATOMIC_SMIN_X2">; -defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0x65, "S_BUFFER_ATOMIC_UMIN_X2">; -defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0x66, "S_BUFFER_ATOMIC_SMAX_X2">; -defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0x67, "S_BUFFER_ATOMIC_UMAX_X2">; -defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0x68, "S_BUFFER_ATOMIC_AND_X2">; -defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0x69, "S_BUFFER_ATOMIC_OR_X2">; -defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0x6a, "S_BUFFER_ATOMIC_XOR_X2">; -defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0x6b, "S_BUFFER_ATOMIC_INC_X2">; -defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0x6c, "S_BUFFER_ATOMIC_DEC_X2">; +defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0x60>; +defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0x61>; +defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0x62>; +defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0x63>; +defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0x64>; +defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0x65>; +defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0x66>; +defm 
S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0x67>; +defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0x68>; +defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0x69>; +defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0x6a>; +defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0x6b>; +defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0x6c>; -defm S_ATOMIC_SWAP : SM_Real_Atomics_vi <0x80, "S_ATOMIC_SWAP">; -defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x81, "S_ATOMIC_CMPSWAP">; -defm S_ATOMIC_ADD : SM_Real_Atomics_vi <0x82, "S_ATOMIC_ADD">; -defm S_ATOMIC_SUB : SM_Real_Atomics_vi <0x83, "S_ATOMIC_SUB">; -defm S_ATOMIC_SMIN : SM_Real_Atomics_vi <0x84, "S_ATOMIC_SMIN">; -defm S_ATOMIC_UMIN : SM_Real_Atomics_vi <0x85, "S_ATOMIC_UMIN">; -defm S_ATOMIC_SMAX : SM_Real_Atomics_vi <0x86, "S_ATOMIC_SMAX">; -defm S_ATOMIC_UMAX : SM_Real_Atomics_vi <0x87, "S_ATOMIC_UMAX">; -defm S_ATOMIC_AND : SM_Real_Atomics_vi <0x88, "S_ATOMIC_AND">; -defm S_ATOMIC_OR : SM_Real_Atomics_vi <0x89, "S_ATOMIC_OR">; -defm S_ATOMIC_XOR : SM_Real_Atomics_vi <0x8a, "S_ATOMIC_XOR">; -defm S_ATOMIC_INC : SM_Real_Atomics_vi <0x8b, "S_ATOMIC_INC">; -defm S_ATOMIC_DEC : SM_Real_Atomics_vi <0x8c, "S_ATOMIC_DEC">; +defm S_ATOMIC_SWAP : SM_Real_Atomics_vi <0x80>; +defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x81>; +defm S_ATOMIC_ADD : SM_Real_Atomics_vi <0x82>; +defm S_ATOMIC_SUB : SM_Real_Atomics_vi <0x83>; +defm S_ATOMIC_SMIN : SM_Real_Atomics_vi <0x84>; +defm S_ATOMIC_UMIN : SM_Real_Atomics_vi <0x85>; +defm S_ATOMIC_SMAX : SM_Real_Atomics_vi <0x86>; +defm S_ATOMIC_UMAX : SM_Real_Atomics_vi <0x87>; +defm S_ATOMIC_AND : SM_Real_Atomics_vi <0x88>; +defm S_ATOMIC_OR : SM_Real_Atomics_vi <0x89>; +defm S_ATOMIC_XOR : SM_Real_Atomics_vi <0x8a>; +defm S_ATOMIC_INC : SM_Real_Atomics_vi <0x8b>; +defm S_ATOMIC_DEC : SM_Real_Atomics_vi <0x8c>; -defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0xa0, "S_ATOMIC_SWAP_X2">; -defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0xa1, "S_ATOMIC_CMPSWAP_X2">; -defm S_ATOMIC_ADD_X2 : 
SM_Real_Atomics_vi <0xa2, "S_ATOMIC_ADD_X2">; -defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0xa3, "S_ATOMIC_SUB_X2">; -defm S_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0xa4, "S_ATOMIC_SMIN_X2">; -defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0xa5, "S_ATOMIC_UMIN_X2">; -defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0xa6, "S_ATOMIC_SMAX_X2">; -defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0xa7, "S_ATOMIC_UMAX_X2">; -defm S_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0xa8, "S_ATOMIC_AND_X2">; -defm S_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0xa9, "S_ATOMIC_OR_X2">; -defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0xaa, "S_ATOMIC_XOR_X2">; -defm S_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0xab, "S_ATOMIC_INC_X2">; -defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0xac, "S_ATOMIC_DEC_X2">; +defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0xa0>; +defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0xa1>; +defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0xa2>; +defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0xa3>; +defm S_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0xa4>; +defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0xa5>; +defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0xa6>; +defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0xa7>; +defm S_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0xa8>; +defm S_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0xa9>; +defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0xaa>; +defm S_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0xab>; +defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0xac>; -multiclass SM_Real_Discard_vi<bits<8> op, string ps> { +multiclass SM_Real_Discard_vi<bits<8> op> { + defvar ps = NAME; def _IMM_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_IMM)>; def _SGPR_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR)>; def _SGPR_alt_gfx9 : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR)>, @@ -749,17 +713,14 @@ multiclass SM_Real_Discard_vi<bits<8> op, string ps> { def _SGPR_IMM_gfx9 : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR_IMM)>; } -defm S_DCACHE_DISCARD : SM_Real_Discard_vi <0x28, "S_DCACHE_DISCARD">; -defm 
S_DCACHE_DISCARD_X2 : SM_Real_Discard_vi <0x29, "S_DCACHE_DISCARD_X2">; +defm S_DCACHE_DISCARD : SM_Real_Discard_vi <0x28>; +defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_vi <0x29>; //===----------------------------------------------------------------------===// // CI //===----------------------------------------------------------------------===// -def smrd_literal_offset : NamedOperandU32<"SMRDLiteralOffset", - NamedMatchClass<"SMRDLiteralOffset">> { - let OperandType = "OPERAND_IMMEDIATE"; -} +def smrd_literal_offset : ImmOperand<i32, "SMRDLiteralOffset">; class SMRD_Real_Load_IMM_ci <bits<5> op, SM_Load_Pseudo ps> : SM_Real<ps>, @@ -854,8 +815,14 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> { // 3. SGPR offset def : GCNPat < (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0)) - >; + (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))> { + let OtherPredicates = [isNotGFX9Plus]; + } + def : GCNPat < + (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))> { + let OtherPredicates = [isGFX9Plus]; + } // 4. SGPR+IMM offset def : GCNPat < @@ -891,8 +858,14 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> { // 3. Offset loaded in an 32bit SGPR def : GCNPat < (SIsbuffer_load v4i32:$sbase, i32:$soffset, timm:$cachepolicy), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$soffset, (extract_cpol $cachepolicy))) - >; + (vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$soffset, (extract_cpol $cachepolicy)))> { + let OtherPredicates = [isNotGFX9Plus]; + } + def : GCNPat < + (SIsbuffer_load v4i32:$sbase, i32:$soffset, timm:$cachepolicy), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, 0, (extract_cpol $cachepolicy)))> { + let OtherPredicates = [isGFX9Plus]; + } // 4. 
Offset as an 32-bit SGPR + immediate def : GCNPat < @@ -929,6 +902,8 @@ foreach vt = SReg_512.RegTypes in { defm : SMRD_Pattern <"S_LOAD_DWORDX16", vt>; } +} // End let AddedComplexity = 100 + defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>; @@ -940,7 +915,6 @@ defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2f32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4f32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>; -} // End let AddedComplexity = 100 let OtherPredicates = [HasSMemTimeInst] in { def : GCNPat < @@ -987,16 +961,14 @@ class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps> let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?); } -class SMEM_Real_Load_gfx10<bits<8> op, string ps, OffsetMode offsets> - : SMEM_Real_gfx10<op, !cast<SM_Pseudo>(ps # offsets.Variant)> { - RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass; - let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol)); -} +class SMEM_Real_Load_gfx10<bits<8> op, string ps> + : SMEM_Real_gfx10<op, !cast<SM_Pseudo>(ps)>; -multiclass SM_Real_Loads_gfx10<bits<8> op, string ps> { - def _IMM_gfx10 : SMEM_Real_Load_gfx10<op, ps, IMM_Offset>; - def _SGPR_gfx10 : SMEM_Real_Load_gfx10<op, ps, SGPR_Offset>; - def _SGPR_IMM_gfx10 : SMEM_Real_Load_gfx10<op, ps, SGPR_IMM_Offset>; +multiclass SM_Real_Loads_gfx10<bits<8> op> { + defvar ps = NAME; + def _IMM_gfx10 : SMEM_Real_Load_gfx10<op, ps#"_IMM">; + def _SGPR_gfx10 : SMEM_Real_Load_gfx10<op, ps#"_SGPR">; + def _SGPR_IMM_gfx10 : SMEM_Real_Load_gfx10<op, ps#"_SGPR_IMM">; } class SMEM_Real_Store_gfx10<bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx10<op, ps> { @@ -1006,53 +978,48 @@ class SMEM_Real_Store_gfx10<bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx10<op, ps> let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?); } -multiclass 
SM_Real_Stores_gfx10<bits<8> op, string ps, - SM_Store_Pseudo immPs = !cast<SM_Store_Pseudo>(ps#_IMM), - SM_Store_Pseudo sgprPs = !cast<SM_Store_Pseudo>(ps#_SGPR)> { - def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, immPs> { - let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol); - } +multiclass SM_Real_Stores_gfx10<bits<8> op> { + defvar ps = NAME; + defvar immPs = !cast<SM_Store_Pseudo>(ps#_IMM); + def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, immPs>; - def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, sgprPs> { - let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol); - } + defvar sgprPs = !cast<SM_Store_Pseudo>(ps#_SGPR); + def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, sgprPs>; - def _SGPR_IMM_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Store_Pseudo>(ps#_SGPR_IMM)> { - let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, - SReg_32:$soffset, smem_offset_mod:$offset, CPol:$cpol); - } + defvar sgprImmPs = !cast<SM_Store_Pseudo>(ps#_SGPR_IMM); + def _SGPR_IMM_gfx10 : SMEM_Real_Store_gfx10 <op, sgprImmPs>; } -defm S_LOAD_DWORD : SM_Real_Loads_gfx10<0x000, "S_LOAD_DWORD">; -defm S_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x001, "S_LOAD_DWORDX2">; -defm S_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x002, "S_LOAD_DWORDX4">; -defm S_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x003, "S_LOAD_DWORDX8">; -defm S_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x004, "S_LOAD_DWORDX16">; +defm S_LOAD_DWORD : SM_Real_Loads_gfx10<0x000>; +defm S_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x001>; +defm S_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x002>; +defm S_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x003>; +defm S_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x004>; let SubtargetPredicate = HasScalarFlatScratchInsts in { -defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_gfx10<0x005, "S_SCRATCH_LOAD_DWORD">; -defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x006, "S_SCRATCH_LOAD_DWORDX2">; -defm S_SCRATCH_LOAD_DWORDX4 : 
SM_Real_Loads_gfx10<0x007, "S_SCRATCH_LOAD_DWORDX4">; +defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_gfx10<0x005>; +defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x006>; +defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x007>; } // End SubtargetPredicate = HasScalarFlatScratchInsts -defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_gfx10<0x008, "S_BUFFER_LOAD_DWORD">; -defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x009, "S_BUFFER_LOAD_DWORDX2">; -defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x00a, "S_BUFFER_LOAD_DWORDX4">; -defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x00b, "S_BUFFER_LOAD_DWORDX8">; -defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x00c, "S_BUFFER_LOAD_DWORDX16">; +defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_gfx10<0x008>; +defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x009>; +defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x00a>; +defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x00b>; +defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x00c>; let SubtargetPredicate = HasScalarStores in { -defm S_STORE_DWORD : SM_Real_Stores_gfx10<0x010, "S_STORE_DWORD">; -defm S_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x011, "S_STORE_DWORDX2">; -defm S_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x012, "S_STORE_DWORDX4">; +defm S_STORE_DWORD : SM_Real_Stores_gfx10<0x010>; +defm S_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x011>; +defm S_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x012>; let OtherPredicates = [HasScalarFlatScratchInsts] in { -defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_gfx10<0x015, "S_SCRATCH_STORE_DWORD">; -defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x016, "S_SCRATCH_STORE_DWORDX2">; -defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x017, "S_SCRATCH_STORE_DWORDX4">; +defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_gfx10<0x015>; +defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x016>; +defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x017>; } // End OtherPredicates = [HasScalarFlatScratchInsts] -defm S_BUFFER_STORE_DWORD : 
SM_Real_Stores_gfx10<0x018, "S_BUFFER_STORE_DWORD">; -defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x019, "S_BUFFER_STORE_DWORDX2">; -defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x01a, "S_BUFFER_STORE_DWORDX4">; +defm S_BUFFER_STORE_DWORD : SM_Real_Stores_gfx10<0x018>; +defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x019>; +defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x01a>; } // End SubtargetPredicate = HasScalarStores def S_MEMREALTIME_gfx10 : SMEM_Real_gfx10<0x025, S_MEMREALTIME>; @@ -1065,15 +1032,16 @@ let SubtargetPredicate = HasScalarStores in { def S_DCACHE_WB_gfx10 : SMEM_Real_gfx10<0x021, S_DCACHE_WB>; } // End SubtargetPredicate = HasScalarStores -multiclass SM_Real_Probe_gfx10<bits<8> op, string ps> { +multiclass SM_Real_Probe_gfx10<bits<8> op> { + defvar ps = NAME; def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_IMM)>; def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR)>; def _SGPR_IMM_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR_IMM)>; } -defm S_ATC_PROBE : SM_Real_Probe_gfx10 <0x26, "S_ATC_PROBE">; -defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx10 <0x27, "S_ATC_PROBE_BUFFER">; +defm S_ATC_PROBE : SM_Real_Probe_gfx10 <0x26>; +defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx10 <0x27>; class SMEM_Atomic_Real_gfx10 <bits<8> op, SM_Atomic_Pseudo ps> : SMEM_Real_gfx10 <op, ps>, @@ -1090,7 +1058,8 @@ class SMEM_Atomic_Real_gfx10 <bits<8> op, SM_Atomic_Pseudo ps> let Inst{12-6} = !if(ps.glc, sdst{6-0}, sdata{6-0}); } -multiclass SM_Real_Atomics_gfx10<bits<8> op, string ps> { +multiclass SM_Real_Atomics_gfx10<bits<8> op> { + defvar ps = NAME; def _IMM_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>; def _SGPR_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>; def _SGPR_IMM_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_IMM)>; @@ -1101,70 +1070,71 @@ multiclass SM_Real_Atomics_gfx10<bits<8> op, string ps> { let 
SubtargetPredicate = HasScalarAtomics in { -defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x40, "S_BUFFER_ATOMIC_SWAP">; -defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x41, "S_BUFFER_ATOMIC_CMPSWAP">; -defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_gfx10 <0x42, "S_BUFFER_ATOMIC_ADD">; -defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x43, "S_BUFFER_ATOMIC_SUB">; -defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x44, "S_BUFFER_ATOMIC_SMIN">; -defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x45, "S_BUFFER_ATOMIC_UMIN">; -defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_gfx10 <0x46, "S_BUFFER_ATOMIC_SMAX">; -defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x47, "S_BUFFER_ATOMIC_UMAX">; -defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x48, "S_BUFFER_ATOMIC_AND">; -defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x49, "S_BUFFER_ATOMIC_OR">; -defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x4a, "S_BUFFER_ATOMIC_XOR">; -defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x4b, "S_BUFFER_ATOMIC_INC">; -defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x4c, "S_BUFFER_ATOMIC_DEC">; +defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x40>; +defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x41>; +defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_gfx10 <0x42>; +defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x43>; +defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x44>; +defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x45>; +defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_gfx10 <0x46>; +defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x47>; +defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x48>; +defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x49>; +defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x4a>; +defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x4b>; +defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x4c>; -defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0x60, "S_BUFFER_ATOMIC_SWAP_X2">; -defm S_BUFFER_ATOMIC_CMPSWAP_X2 : 
SM_Real_Atomics_gfx10 <0x61, "S_BUFFER_ATOMIC_CMPSWAP_X2">; -defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0x62, "S_BUFFER_ATOMIC_ADD_X2">; -defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0x63, "S_BUFFER_ATOMIC_SUB_X2">; -defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0x64, "S_BUFFER_ATOMIC_SMIN_X2">; -defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0x65, "S_BUFFER_ATOMIC_UMIN_X2">; -defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0x66, "S_BUFFER_ATOMIC_SMAX_X2">; -defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0x67, "S_BUFFER_ATOMIC_UMAX_X2">; -defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0x68, "S_BUFFER_ATOMIC_AND_X2">; -defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0x69, "S_BUFFER_ATOMIC_OR_X2">; -defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0x6a, "S_BUFFER_ATOMIC_XOR_X2">; -defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0x6b, "S_BUFFER_ATOMIC_INC_X2">; -defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0x6c, "S_BUFFER_ATOMIC_DEC_X2">; +defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0x60>; +defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_gfx10 <0x61>; +defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0x62>; +defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0x63>; +defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0x64>; +defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0x65>; +defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0x66>; +defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0x67>; +defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0x68>; +defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0x69>; +defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0x6a>; +defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0x6b>; +defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0x6c>; -defm S_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x80, "S_ATOMIC_SWAP">; -defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x81, "S_ATOMIC_CMPSWAP">; -defm S_ATOMIC_ADD : 
SM_Real_Atomics_gfx10 <0x82, "S_ATOMIC_ADD">; -defm S_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x83, "S_ATOMIC_SUB">; -defm S_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x84, "S_ATOMIC_SMIN">; -defm S_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x85, "S_ATOMIC_UMIN">; -defm S_ATOMIC_SMAX : SM_Real_Atomics_gfx10 <0x86, "S_ATOMIC_SMAX">; -defm S_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x87, "S_ATOMIC_UMAX">; -defm S_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x88, "S_ATOMIC_AND">; -defm S_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x89, "S_ATOMIC_OR">; -defm S_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x8a, "S_ATOMIC_XOR">; -defm S_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x8b, "S_ATOMIC_INC">; -defm S_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x8c, "S_ATOMIC_DEC">; +defm S_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x80>; +defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x81>; +defm S_ATOMIC_ADD : SM_Real_Atomics_gfx10 <0x82>; +defm S_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x83>; +defm S_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x84>; +defm S_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x85>; +defm S_ATOMIC_SMAX : SM_Real_Atomics_gfx10 <0x86>; +defm S_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x87>; +defm S_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x88>; +defm S_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x89>; +defm S_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x8a>; +defm S_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x8b>; +defm S_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x8c>; -defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0xa0, "S_ATOMIC_SWAP_X2">; -defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_gfx10 <0xa1, "S_ATOMIC_CMPSWAP_X2">; -defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0xa2, "S_ATOMIC_ADD_X2">; -defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0xa3, "S_ATOMIC_SUB_X2">; -defm S_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0xa4, "S_ATOMIC_SMIN_X2">; -defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0xa5, "S_ATOMIC_UMIN_X2">; -defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0xa6, "S_ATOMIC_SMAX_X2">; -defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0xa7, "S_ATOMIC_UMAX_X2">; 
-defm S_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0xa8, "S_ATOMIC_AND_X2">; -defm S_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0xa9, "S_ATOMIC_OR_X2">; -defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0xaa, "S_ATOMIC_XOR_X2">; -defm S_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0xab, "S_ATOMIC_INC_X2">; -defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0xac, "S_ATOMIC_DEC_X2">; +defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0xa0>; +defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_gfx10 <0xa1>; +defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0xa2>; +defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0xa3>; +defm S_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0xa4>; +defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0xa5>; +defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0xa6>; +defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0xa7>; +defm S_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0xa8>; +defm S_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0xa9>; +defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0xaa>; +defm S_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0xab>; +defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0xac>; -multiclass SM_Real_Discard_gfx10<bits<8> op, string ps> { +multiclass SM_Real_Discard_gfx10<bits<8> op> { + defvar ps = NAME; def _IMM_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_IMM)>; def _SGPR_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR)>; def _SGPR_IMM_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR_IMM)>; } -defm S_DCACHE_DISCARD : SM_Real_Discard_gfx10 <0x28, "S_DCACHE_DISCARD">; -defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_gfx10 <0x29, "S_DCACHE_DISCARD_X2">; +defm S_DCACHE_DISCARD : SM_Real_Discard_gfx10 <0x28>; +defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_gfx10 <0x29>; } // End SubtargetPredicate = HasScalarAtomics @@ -1190,31 +1160,29 @@ class SMEM_Real_gfx11<bits<8> op, SM_Pseudo ps, string opName = ps.Mnemonic> : let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, 0); } -class SMEM_Real_Load_gfx11<bits<8> op, string ps, string opName, OffsetMode offsets> : - SMEM_Real_gfx11<op, 
!cast<SM_Pseudo>(ps # offsets.Variant), opName> { - RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass; - let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol)); -} +class SMEM_Real_Load_gfx11<bits<8> op, string ps, string opName> : + SMEM_Real_gfx11<op, !cast<SM_Pseudo>(ps), opName>; -multiclass SM_Real_Loads_gfx11<bits<8> op, string ps, string opName> { - def _IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps, opName, IMM_Offset>; - def _SGPR_gfx11 : SMEM_Real_Load_gfx11<op, ps, opName, SGPR_Offset>; - def _SGPR_IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps, opName, SGPR_IMM_Offset>; +multiclass SM_Real_Loads_gfx11<bits<8> op, string ps> { + defvar opName = !tolower(NAME); + def _IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_IMM", opName>; + def _SGPR_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_SGPR", opName>; + def _SGPR_IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_SGPR_IMM", opName>; def : MnemonicAlias<!cast<SM_Pseudo>(ps#"_IMM").Mnemonic, opName>, Requires<[isGFX11Plus]>; } -defm S_LOAD_B32 : SM_Real_Loads_gfx11<0x000, "S_LOAD_DWORD", "s_load_b32">; -defm S_LOAD_B64 : SM_Real_Loads_gfx11<0x001, "S_LOAD_DWORDX2", "s_load_b64">; -defm S_LOAD_B128 : SM_Real_Loads_gfx11<0x002, "S_LOAD_DWORDX4", "s_load_b128">; -defm S_LOAD_B256 : SM_Real_Loads_gfx11<0x003, "S_LOAD_DWORDX8", "s_load_b256">; -defm S_LOAD_B512 : SM_Real_Loads_gfx11<0x004, "S_LOAD_DWORDX16", "s_load_b512">; +defm S_LOAD_B32 : SM_Real_Loads_gfx11<0x000, "S_LOAD_DWORD">; +defm S_LOAD_B64 : SM_Real_Loads_gfx11<0x001, "S_LOAD_DWORDX2">; +defm S_LOAD_B128 : SM_Real_Loads_gfx11<0x002, "S_LOAD_DWORDX4">; +defm S_LOAD_B256 : SM_Real_Loads_gfx11<0x003, "S_LOAD_DWORDX8">; +defm S_LOAD_B512 : SM_Real_Loads_gfx11<0x004, "S_LOAD_DWORDX16">; -defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx11<0x008, "S_BUFFER_LOAD_DWORD", "s_buffer_load_b32">; -defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx11<0x009, "S_BUFFER_LOAD_DWORDX2", "s_buffer_load_b64">; -defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx11<0x00a, 
"S_BUFFER_LOAD_DWORDX4", "s_buffer_load_b128">; -defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx11<0x00b, "S_BUFFER_LOAD_DWORDX8", "s_buffer_load_b256">; -defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx11<0x00c, "S_BUFFER_LOAD_DWORDX16", "s_buffer_load_b512">; +defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx11<0x008, "S_BUFFER_LOAD_DWORD">; +defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx11<0x009, "S_BUFFER_LOAD_DWORDX2">; +defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx11<0x00a, "S_BUFFER_LOAD_DWORDX4">; +defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx11<0x00b, "S_BUFFER_LOAD_DWORDX8">; +defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx11<0x00c, "S_BUFFER_LOAD_DWORDX16">; def S_GL1_INV_gfx11 : SMEM_Real_gfx11<0x020, S_GL1_INV>; def S_DCACHE_INV_gfx11 : SMEM_Real_gfx11<0x021, S_DCACHE_INV>; @@ -1227,12 +1195,13 @@ class SMEM_Real_Store_gfx11 <bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx11<op, ps> let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?); } -multiclass SM_Real_Probe_gfx11<bits<8> op, string ps> { +multiclass SM_Real_Probe_gfx11<bits<8> op> { + defvar ps = NAME; def _IMM_gfx11 : SMEM_Real_Store_gfx11 <op, !cast<SM_Probe_Pseudo>(ps#_IMM)>; def _SGPR_gfx11 : SMEM_Real_Store_gfx11 <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>; def _SGPR_IMM_gfx11 : SMEM_Real_Store_gfx11 <op, !cast<SM_Probe_Pseudo>(ps#_SGPR_IMM)>; } -defm S_ATC_PROBE : SM_Real_Probe_gfx11 <0x22, "S_ATC_PROBE">; -defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx11 <0x23, "S_ATC_PROBE_BUFFER">; +defm S_ATC_PROBE : SM_Real_Probe_gfx11 <0x22>; +defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx11 <0x23>; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index ad9af662307f..bee996d1b0df 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -6,18 +6,7 @@ // //===----------------------------------------------------------------------===// -def GPRIdxModeMatchClass : AsmOperandClass { - let Name = "GPRIdxMode"; - let PredicateMethod = "isGPRIdxMode"; - let 
ParserMethod = "parseGPRIdxMode"; - let RenderMethod = "addImmOperands"; -} - -def GPRIdxMode : Operand<i32> { - let PrintMethod = "printVGPRIndexMode"; - let ParserMatchClass = GPRIdxModeMatchClass; - let OperandType = "OPERAND_IMMEDIATE"; -} +def GPRIdxMode : CustomOperand<i32>; class SOP_Pseudo<string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> : @@ -402,11 +391,11 @@ let SubtargetPredicate = isGFX11Plus in { // For s_sendmsg_rtn_* the src0 field encodes the message type directly; it // is not an SGPR number. def S_SENDMSG_RTN_B32 : SOP1_Pseudo< - "s_sendmsg_rtn_b32", (outs SReg_32:$sdst), (ins SendMsgImm:$src0), + "s_sendmsg_rtn_b32", (outs SReg_32:$sdst), (ins SendMsg:$src0), "$sdst, $src0", [(set i32:$sdst, (int_amdgcn_s_sendmsg_rtn timm:$src0))] >; def S_SENDMSG_RTN_B64 : SOP1_Pseudo< - "s_sendmsg_rtn_b64", (outs SReg_64:$sdst), (ins SendMsgImm:$src0), + "s_sendmsg_rtn_b64", (outs SReg_64:$sdst), (ins SendMsg:$src0), "$sdst, $src0", [(set i64:$sdst, (int_amdgcn_s_sendmsg_rtn timm:$src0))] >; } @@ -795,7 +784,7 @@ class SOPK_32 <string opName, list<dag> pattern=[]> : SOPK_Pseudo < class SOPK_32_BR <string opName, list<dag> pattern=[]> : SOPK_Pseudo < opName, (outs), - (ins sopp_brtarget:$simm16, SReg_32:$sdst), + (ins SOPPBrTarget:$simm16, SReg_32:$sdst), "$sdst, $simm16", pattern> { let Defs = [EXEC]; @@ -875,7 +864,7 @@ let isCommutable = 1, DisableEncoding = "$src0", let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in def S_CBRANCH_I_FORK : SOPK_Pseudo < "s_cbranch_i_fork", - (outs), (ins SReg_64:$sdst, sopp_brtarget:$simm16), + (outs), (ins SReg_64:$sdst, SOPPBrTarget:$simm16), "$sdst, $simm16" >; @@ -953,7 +942,7 @@ let SubtargetPredicate = isGFX9Plus in { def S_CALL_B64 : SOPK_Pseudo< "s_call_b64", (outs SReg_64:$sdst), - (ins sopp_brtarget:$simm16), + (ins SOPPBrTarget:$simm16), "$sdst, $simm16"> { let isCall = 1; } @@ -1175,7 +1164,7 @@ multiclass SOPP_With_Relaxation <string opName, dag ins, def S_NOP : SOPP_Pseudo<"s_nop" , (ins 
i16imm:$simm16), "$simm16">; let isTerminator = 1 in { -def S_ENDPGM : SOPP_Pseudo<"s_endpgm", (ins EndpgmImm:$simm16), "$simm16", [], ""> { +def S_ENDPGM : SOPP_Pseudo<"s_endpgm", (ins Endpgm:$simm16), "$simm16", [], ""> { let isBarrier = 1; let isReturn = 1; let hasSideEffects = 1; @@ -1206,60 +1195,60 @@ let SubtargetPredicate = isGFX10Plus in { let isBranch = 1, SchedRW = [WriteBranch] in { let isBarrier = 1 in { defm S_BRANCH : SOPP_With_Relaxation< - "s_branch" , (ins sopp_brtarget:$simm16), "$simm16", + "s_branch" , (ins SOPPBrTarget:$simm16), "$simm16", [(br bb:$simm16)]>; } let Uses = [SCC] in { defm S_CBRANCH_SCC0 : SOPP_With_Relaxation< - "s_cbranch_scc0" , (ins sopp_brtarget:$simm16), + "s_cbranch_scc0" , (ins SOPPBrTarget:$simm16), "$simm16" >; defm S_CBRANCH_SCC1 : SOPP_With_Relaxation < - "s_cbranch_scc1" , (ins sopp_brtarget:$simm16), + "s_cbranch_scc1" , (ins SOPPBrTarget:$simm16), "$simm16" >; } // End Uses = [SCC] let Uses = [VCC] in { defm S_CBRANCH_VCCZ : SOPP_With_Relaxation < - "s_cbranch_vccz" , (ins sopp_brtarget:$simm16), + "s_cbranch_vccz" , (ins SOPPBrTarget:$simm16), "$simm16" >; defm S_CBRANCH_VCCNZ : SOPP_With_Relaxation < - "s_cbranch_vccnz" , (ins sopp_brtarget:$simm16), + "s_cbranch_vccnz" , (ins SOPPBrTarget:$simm16), "$simm16" >; } // End Uses = [VCC] let Uses = [EXEC] in { defm S_CBRANCH_EXECZ : SOPP_With_Relaxation < - "s_cbranch_execz" , (ins sopp_brtarget:$simm16), + "s_cbranch_execz" , (ins SOPPBrTarget:$simm16), "$simm16" >; defm S_CBRANCH_EXECNZ : SOPP_With_Relaxation < - "s_cbranch_execnz" , (ins sopp_brtarget:$simm16), + "s_cbranch_execnz" , (ins SOPPBrTarget:$simm16), "$simm16" >; } // End Uses = [EXEC] defm S_CBRANCH_CDBGSYS : SOPP_With_Relaxation < - "s_cbranch_cdbgsys" , (ins sopp_brtarget:$simm16), + "s_cbranch_cdbgsys" , (ins SOPPBrTarget:$simm16), "$simm16" >; defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_With_Relaxation < - "s_cbranch_cdbgsys_and_user" , (ins sopp_brtarget:$simm16), + "s_cbranch_cdbgsys_and_user" , (ins 
SOPPBrTarget:$simm16), "$simm16" >; defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_With_Relaxation < - "s_cbranch_cdbgsys_or_user" , (ins sopp_brtarget:$simm16), + "s_cbranch_cdbgsys_or_user" , (ins SOPPBrTarget:$simm16), "$simm16" >; defm S_CBRANCH_CDBGUSER : SOPP_With_Relaxation < - "s_cbranch_cdbguser" , (ins sopp_brtarget:$simm16), + "s_cbranch_cdbguser" , (ins SOPPBrTarget:$simm16), "$simm16" >; @@ -1284,7 +1273,7 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > { } let hasSideEffects = 1 in -def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins WAIT_FLAG:$simm16), "$simm16", +def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16", [(int_amdgcn_s_waitcnt timm:$simm16)]>; def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sethalt timm:$simm16)]>; @@ -1305,12 +1294,12 @@ def S_SETPRIO : SOPP_Pseudo <"s_setprio", (ins i16imm:$simm16), "$simm16", } let Uses = [EXEC, M0] in { -def S_SENDMSG : SOPP_Pseudo <"s_sendmsg" , (ins SendMsgImm:$simm16), "$simm16", +def S_SENDMSG : SOPP_Pseudo <"s_sendmsg" , (ins SendMsg:$simm16), "$simm16", [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]> { let hasSideEffects = 1; } -def S_SENDMSGHALT : SOPP_Pseudo <"s_sendmsghalt" , (ins SendMsgImm:$simm16), "$simm16", +def S_SENDMSGHALT : SOPP_Pseudo <"s_sendmsghalt" , (ins SendMsg:$simm16), "$simm16", [(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]> { let hasSideEffects = 1; } @@ -1367,7 +1356,7 @@ let SubtargetPredicate = isGFX10Plus in { let fixed_imm = 1; } def S_WAITCNT_DEPCTR : - SOPP_Pseudo <"s_waitcnt_depctr" , (ins DepCtrImm:$simm16), "$simm16">; + SOPP_Pseudo <"s_waitcnt_depctr" , (ins DepCtr:$simm16), "$simm16">; let hasSideEffects = 0, Uses = [MODE], Defs = [MODE] in { def S_ROUND_MODE : @@ -1386,7 +1375,7 @@ let SubtargetPredicate = isGFX11Plus in { "$simm16"> { let hasSideEffects = 1; } - def S_DELAY_ALU : SOPP_Pseudo<"s_delay_alu", (ins DELAY_FLAG:$simm16), + def S_DELAY_ALU : SOPP_Pseudo<"s_delay_alu", (ins 
SDelayALU:$simm16), "$simm16">; } // End SubtargetPredicate = isGFX11Plus diff --git a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp index 7573af597056..98fd16e59bf1 100644 --- a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp +++ b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp @@ -15,13 +15,13 @@ using namespace llvm; -/// The target which supports all AMD GPUs. This will eventually -/// be deprecated and there will be a R600 target and a GCN target. -Target &llvm::getTheAMDGPUTarget() { +/// The target for R600 GPUs. +Target &llvm::getTheR600Target() { static Target TheAMDGPUTarget; return TheAMDGPUTarget; } -/// The target for GCN GPUs + +/// The target for GCN GPUs. Target &llvm::getTheGCNTarget() { static Target TheGCNTarget; return TheGCNTarget; @@ -29,7 +29,7 @@ Target &llvm::getTheGCNTarget() { /// Extern function to initialize the targets for the AMDGPU backend extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetInfo() { - RegisterTarget<Triple::r600, false> R600(getTheAMDGPUTarget(), "r600", + RegisterTarget<Triple::r600, false> R600(getTheR600Target(), "r600", "AMD GPUs HD2XXX-HD6XXX", "AMDGPU"); RegisterTarget<Triple::amdgcn, false> GCN(getTheGCNTarget(), "amdgcn", "AMD GCN GPUs", "AMDGPU"); diff --git a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h index 1e6dbd90b0c1..45470167a331 100644 --- a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h +++ b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h @@ -17,11 +17,10 @@ namespace llvm { class Target; -/// The target which supports all AMD GPUs. This will eventually -/// be deprecated and there will be a R600 target and a GCN target. -Target &getTheAMDGPUTarget(); +/// The target for R600 GPUs. +Target &getTheR600Target(); -/// The target for GCN GPUs +/// The target for GCN GPUs. 
Target &getTheGCNTarget(); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index c0fd5bc69325..ce40d82021cf 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -44,7 +44,7 @@ const CustomOperand<const MCSubtargetInfo &> Msg[] = { {{"MSG_SAVEWAVE"}, ID_SAVEWAVE, isGFX8_GFX9_GFX10}, {{"MSG_STALL_WAVE_GEN"}, ID_STALL_WAVE_GEN, isGFX9Plus}, {{"MSG_HALT_WAVES"}, ID_HALT_WAVES, isGFX9Plus}, - {{"MSG_ORDERED_PS_DONE"}, ID_ORDERED_PS_DONE, isGFX9Plus}, + {{"MSG_ORDERED_PS_DONE"}, ID_ORDERED_PS_DONE, isGFX9_GFX10}, {{"MSG_EARLY_PRIM_DEALLOC"}, ID_EARLY_PRIM_DEALLOC, isGFX9_GFX10}, {{"MSG_GS_ALLOC_REQ"}, ID_GS_ALLOC_REQ, isGFX9Plus}, {{"MSG_GET_DOORBELL"}, ID_GET_DOORBELL, isGFX9_GFX10}, @@ -115,10 +115,14 @@ const CustomOperand<const MCSubtargetInfo &> Opr[] = { {{"HW_REG_HW_ID2"}, ID_HW_ID2, isGFX10Plus}, {{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10}, {{""}}, - {{""}}, + {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA, isGFX11Plus}, {{""}}, {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_BEncoding}, + // Register numbers reused in GFX11+ + {{"HW_REG_PERF_SNAPSHOT_PC_LO"}, ID_PERF_SNAPSHOT_PC_LO, isGFX11Plus}, + {{"HW_REG_PERF_SNAPSHOT_PC_HI"}, ID_PERF_SNAPSHOT_PC_HI, isGFX11Plus}, + // GFX940 specific registers {{"HW_REG_XCC_ID"}, ID_XCC_ID, isGFX940}, {{"HW_REG_SQ_PERF_SNAPSHOT_DATA"}, ID_SQ_PERF_SNAPSHOT_DATA, isGFX940}, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 4263e3e9eeac..296ea18b2a8d 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -10,19 +10,22 @@ #include "AMDGPU.h" #include "AMDGPUAsmUtils.h" #include "AMDKernelCodeT.h" -#include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include 
"llvm/IR/Attributes.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/TargetParser.h" +#include "llvm/TargetParser/TargetParser.h" #include <optional> #define GET_INSTRINFO_NAMED_OPS @@ -92,6 +95,24 @@ unsigned getVmcntBitWidthHi(unsigned VersionMajor) { return (VersionMajor == 9 || VersionMajor == 10) ? 2 : 0; } +/// \returns VmVsrc bit width +inline unsigned getVmVsrcBitWidth() { return 3; } + +/// \returns VmVsrc bit shift +inline unsigned getVmVsrcBitShift() { return 2; } + +/// \returns VaVdst bit width +inline unsigned getVaVdstBitWidth() { return 4; } + +/// \returns VaVdst bit shift +inline unsigned getVaVdstBitShift() { return 12; } + +/// \returns SaSdst bit width +inline unsigned getSaSdstBitWidth() { return 1; } + +/// \returns SaSdst bit shift +inline unsigned getSaSdstBitShift() { return 0; } + } // end namespace anonymous namespace llvm { @@ -150,56 +171,62 @@ unsigned getAmdhsaCodeObjectVersion() { return AmdhsaCodeObjectVersion; } -unsigned getMultigridSyncArgImplicitArgPosition() { - switch (AmdhsaCodeObjectVersion) { - case 2: - case 3: - case 4: +unsigned getCodeObjectVersion(const Module &M) { + if (auto Ver = mdconst::extract_or_null<ConstantInt>( + M.getModuleFlag("amdgpu_code_object_version"))) { + return (unsigned)Ver->getZExtValue() / 100; + } + + // Default code object version. 
+ return AMDHSA_COV4; +} + +unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) { + switch (CodeObjectVersion) { + case AMDHSA_COV2: + case AMDHSA_COV3: + case AMDHSA_COV4: return 48; - case 5: - return AMDGPU::ImplicitArg::MULTIGRID_SYNC_ARG_OFFSET; + case AMDHSA_COV5: default: - llvm_unreachable("Unexpected code object version"); - return 0; + return AMDGPU::ImplicitArg::MULTIGRID_SYNC_ARG_OFFSET; } } // FIXME: All such magic numbers about the ABI should be in a // central TD file. -unsigned getHostcallImplicitArgPosition() { - switch (AmdhsaCodeObjectVersion) { - case 2: - case 3: - case 4: +unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion) { + switch (CodeObjectVersion) { + case AMDHSA_COV2: + case AMDHSA_COV3: + case AMDHSA_COV4: return 24; - case 5: - return AMDGPU::ImplicitArg::HOSTCALL_PTR_OFFSET; + case AMDHSA_COV5: default: - llvm_unreachable("Unexpected code object version"); - return 0; + return AMDGPU::ImplicitArg::HOSTCALL_PTR_OFFSET; } } -unsigned getDefaultQueueImplicitArgPosition() { - switch (AmdhsaCodeObjectVersion) { - case 2: - case 3: - case 4: +unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion) { + switch (CodeObjectVersion) { + case AMDHSA_COV2: + case AMDHSA_COV3: + case AMDHSA_COV4: return 32; - case 5: + case AMDHSA_COV5: default: return AMDGPU::ImplicitArg::DEFAULT_QUEUE_OFFSET; } } -unsigned getCompletionActionImplicitArgPosition() { - switch (AmdhsaCodeObjectVersion) { - case 2: - case 3: - case 4: +unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) { + switch (CodeObjectVersion) { + case AMDHSA_COV2: + case AMDHSA_COV3: + case AMDHSA_COV4: return 40; - case 5: + case AMDHSA_COV5: default: return AMDGPU::ImplicitArg::COMPLETION_ACTION_OFFSET; } @@ -568,9 +595,10 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex( unsigned CompOprIdx; for (CompOprIdx = 0; CompOprIdx < Component::MAX_OPR_NUM; ++CompOprIdx) { - unsigned BanksNum = 
BANKS_NUM[CompOprIdx]; + unsigned BanksMasks = VOPD_VGPR_BANK_MASKS[CompOprIdx]; if (OpXRegs[CompOprIdx] && OpYRegs[CompOprIdx] && - (OpXRegs[CompOprIdx] % BanksNum == OpYRegs[CompOprIdx] % BanksNum)) + ((OpXRegs[CompOprIdx] & BanksMasks) == + (OpYRegs[CompOprIdx] & BanksMasks))) return CompOprIdx; } @@ -624,7 +652,7 @@ namespace IsaInfo { AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI) : STI(STI), XnackSetting(TargetIDSetting::Any), - SramEccSetting(TargetIDSetting::Any) { + SramEccSetting(TargetIDSetting::Any), CodeObjectVersion(0) { if (!STI.getFeatureBits().test(FeatureSupportsXNACK)) XnackSetting = TargetIDSetting::Unsupported; if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC)) @@ -735,9 +763,9 @@ std::string AMDGPUTargetID::toString() const { .str(); std::string Features; - if (std::optional<uint8_t> HsaAbiVersion = getHsaAbiVersion(&STI)) { - switch (*HsaAbiVersion) { - case ELF::ELFABIVERSION_AMDGPU_HSA_V2: + if (STI.getTargetTriple().getOS() == Triple::AMDHSA) { + switch (CodeObjectVersion) { + case AMDGPU::AMDHSA_COV2: // Code object V2 only supported specific processors and had fixed // settings for the XNACK. if (Processor == "gfx600") { @@ -785,7 +813,7 @@ std::string AMDGPUTargetID::toString() const { Twine(Processor)); } break; - case ELF::ELFABIVERSION_AMDGPU_HSA_V3: + case AMDGPU::AMDHSA_COV3: // xnack. if (isXnackOnOrAny()) Features += "+xnack"; @@ -794,8 +822,8 @@ std::string AMDGPUTargetID::toString() const { if (isSramEccOnOrAny()) Features += "+sram-ecc"; break; - case ELF::ELFABIVERSION_AMDGPU_HSA_V4: - case ELF::ELFABIVERSION_AMDGPU_HSA_V5: + case AMDGPU::AMDHSA_COV4: + case AMDGPU::AMDHSA_COV5: // sramecc. 
if (getSramEccSetting() == TargetIDSetting::Off) Features += ":sramecc-"; @@ -1205,16 +1233,16 @@ bool shouldEmitConstantsToTextSection(const Triple &TT) { return TT.getArch() == Triple::r600; } -std::pair<int, int> getIntegerPairAttribute(const Function &F, - StringRef Name, - std::pair<int, int> Default, - bool OnlyFirstRequired) { +std::pair<unsigned, unsigned> +getIntegerPairAttribute(const Function &F, StringRef Name, + std::pair<unsigned, unsigned> Default, + bool OnlyFirstRequired) { Attribute A = F.getFnAttribute(Name); if (!A.isStringAttribute()) return Default; LLVMContext &Ctx = F.getContext(); - std::pair<int, int> Ints = Default; + std::pair<unsigned, unsigned> Ints = Default; std::pair<StringRef, StringRef> Strs = A.getValueAsString().split(','); if (Strs.first.trim().getAsInteger(0, Ints.first)) { Ctx.emitError("can't parse first integer attribute " + Name); @@ -1491,6 +1519,42 @@ int encodeDepCtr(const StringRef Name, int64_t Val, unsigned &UsedOprMask, STI); } +unsigned decodeFieldVmVsrc(unsigned Encoded) { + return unpackBits(Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth()); +} + +unsigned decodeFieldVaVdst(unsigned Encoded) { + return unpackBits(Encoded, getVaVdstBitShift(), getVaVdstBitWidth()); +} + +unsigned decodeFieldSaSdst(unsigned Encoded) { + return unpackBits(Encoded, getSaSdstBitShift(), getSaSdstBitWidth()); +} + +unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc) { + return packBits(VmVsrc, Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth()); +} + +unsigned encodeFieldVmVsrc(unsigned VmVsrc) { + return encodeFieldVmVsrc(0xffff, VmVsrc); +} + +unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst) { + return packBits(VaVdst, Encoded, getVaVdstBitShift(), getVaVdstBitWidth()); +} + +unsigned encodeFieldVaVdst(unsigned VaVdst) { + return encodeFieldVaVdst(0xffff, VaVdst); +} + +unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst) { + return packBits(SaSdst, Encoded, getSaSdstBitShift(), getSaSdstBitWidth()); 
+} + +unsigned encodeFieldSaSdst(unsigned SaSdst) { + return encodeFieldSaSdst(0xffff, SaSdst); +} + } // namespace DepCtr //===----------------------------------------------------------------------===// @@ -1913,44 +1977,53 @@ bool isKernelCC(const Function *Func) { } bool hasXNACK(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureXNACK]; + return STI.hasFeature(AMDGPU::FeatureXNACK); } bool hasSRAMECC(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureSRAMECC]; + return STI.hasFeature(AMDGPU::FeatureSRAMECC); } bool hasMIMG_R128(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128] && !STI.getFeatureBits()[AMDGPU::FeatureR128A16]; + return STI.hasFeature(AMDGPU::FeatureMIMG_R128) && !STI.hasFeature(AMDGPU::FeatureR128A16); } bool hasA16(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureA16]; + return STI.hasFeature(AMDGPU::FeatureA16); } bool hasG16(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureG16]; + return STI.hasFeature(AMDGPU::FeatureG16); } bool hasPackedD16(const MCSubtargetInfo &STI) { - return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem] && !isCI(STI) && + return !STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem) && !isCI(STI) && !isSI(STI); } +unsigned getNSAMaxSize(const MCSubtargetInfo &STI) { + auto Version = getIsaVersion(STI.getCPU()); + if (Version.Major == 10) + return Version.Minor >= 3 ? 
13 : 5; + if (Version.Major == 11) + return 5; + return 0; +} + bool isSI(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands]; + return STI.hasFeature(AMDGPU::FeatureSouthernIslands); } bool isCI(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureSeaIslands]; + return STI.hasFeature(AMDGPU::FeatureSeaIslands); } bool isVI(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; + return STI.hasFeature(AMDGPU::FeatureVolcanicIslands); } bool isGFX9(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; + return STI.hasFeature(AMDGPU::FeatureGFX9); } bool isGFX9_GFX10(const MCSubtargetInfo &STI) { @@ -1970,7 +2043,7 @@ bool isGFX9Plus(const MCSubtargetInfo &STI) { } bool isGFX10(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureGFX10]; + return STI.hasFeature(AMDGPU::FeatureGFX10); } bool isGFX10Plus(const MCSubtargetInfo &STI) { @@ -1978,7 +2051,7 @@ bool isGFX10Plus(const MCSubtargetInfo &STI) { } bool isGFX11(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureGFX11]; + return STI.hasFeature(AMDGPU::FeatureGFX11); } bool isGFX11Plus(const MCSubtargetInfo &STI) { @@ -1998,39 +2071,39 @@ bool isGFX10Before1030(const MCSubtargetInfo &STI) { } bool isGCN3Encoding(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]; + return STI.hasFeature(AMDGPU::FeatureGCN3Encoding); } bool isGFX10_AEncoding(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureGFX10_AEncoding]; + return STI.hasFeature(AMDGPU::FeatureGFX10_AEncoding); } bool isGFX10_BEncoding(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]; + return STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding); } bool hasGFX10_3Insts(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts]; + return 
STI.hasFeature(AMDGPU::FeatureGFX10_3Insts); } bool isGFX90A(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]; + return STI.hasFeature(AMDGPU::FeatureGFX90AInsts); } bool isGFX940(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureGFX940Insts]; + return STI.hasFeature(AMDGPU::FeatureGFX940Insts); } bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch]; + return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch); } bool hasMAIInsts(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureMAIInsts]; + return STI.hasFeature(AMDGPU::FeatureMAIInsts); } bool hasVOPD(const MCSubtargetInfo &STI) { - return STI.getFeatureBits()[AMDGPU::FeatureVOPD]; + return STI.hasFeature(AMDGPU::FeatureVOPD); } int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, @@ -2350,11 +2423,15 @@ unsigned getRegBitWidth(const MCRegisterClass &RC) { return getRegBitWidth(RC.getID()); } +unsigned getRegBitWidth(const TargetRegisterClass &RC) { + return getRegBitWidth(RC.getID()); +} + unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, unsigned OpNo) { assert(OpNo < Desc.NumOperands); unsigned RCID = Desc.operands()[OpNo].RegClass; - return getRegBitWidth(MRI->getRegClass(RCID)) / 8; + return getRegBitWidth(RCID) / 8; } bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) { @@ -2362,15 +2439,15 @@ bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) { return true; uint64_t Val = static_cast<uint64_t>(Literal); - return (Val == DoubleToBits(0.0)) || - (Val == DoubleToBits(1.0)) || - (Val == DoubleToBits(-1.0)) || - (Val == DoubleToBits(0.5)) || - (Val == DoubleToBits(-0.5)) || - (Val == DoubleToBits(2.0)) || - (Val == DoubleToBits(-2.0)) || - (Val == DoubleToBits(4.0)) || - (Val == DoubleToBits(-4.0)) || + return (Val == llvm::bit_cast<uint64_t>(0.0)) || + (Val == llvm::bit_cast<uint64_t>(1.0)) 
|| + (Val == llvm::bit_cast<uint64_t>(-1.0)) || + (Val == llvm::bit_cast<uint64_t>(0.5)) || + (Val == llvm::bit_cast<uint64_t>(-0.5)) || + (Val == llvm::bit_cast<uint64_t>(2.0)) || + (Val == llvm::bit_cast<uint64_t>(-2.0)) || + (Val == llvm::bit_cast<uint64_t>(4.0)) || + (Val == llvm::bit_cast<uint64_t>(-4.0)) || (Val == 0x3fc45f306dc9c882 && HasInv2Pi); } @@ -2388,15 +2465,15 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) { // floating-point, so it is a legal inline immediate. uint32_t Val = static_cast<uint32_t>(Literal); - return (Val == FloatToBits(0.0f)) || - (Val == FloatToBits(1.0f)) || - (Val == FloatToBits(-1.0f)) || - (Val == FloatToBits(0.5f)) || - (Val == FloatToBits(-0.5f)) || - (Val == FloatToBits(2.0f)) || - (Val == FloatToBits(-2.0f)) || - (Val == FloatToBits(4.0f)) || - (Val == FloatToBits(-4.0f)) || + return (Val == llvm::bit_cast<uint32_t>(0.0f)) || + (Val == llvm::bit_cast<uint32_t>(1.0f)) || + (Val == llvm::bit_cast<uint32_t>(-1.0f)) || + (Val == llvm::bit_cast<uint32_t>(0.5f)) || + (Val == llvm::bit_cast<uint32_t>(-0.5f)) || + (Val == llvm::bit_cast<uint32_t>(2.0f)) || + (Val == llvm::bit_cast<uint32_t>(-2.0f)) || + (Val == llvm::bit_cast<uint32_t>(4.0f)) || + (Val == llvm::bit_cast<uint32_t>(-4.0f)) || (Val == 0x3e22f983 && HasInv2Pi); } @@ -2475,10 +2552,35 @@ bool isArgPassedInSGPR(const Argument *A) { case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_Gfx: - // For non-compute shaders, SGPR inputs are marked with either inreg or byval. - // Everything else is in VGPRs. - return F->getAttributes().hasParamAttr(A->getArgNo(), Attribute::InReg) || - F->getAttributes().hasParamAttr(A->getArgNo(), Attribute::ByVal); + // For non-compute shaders, SGPR inputs are marked with either inreg or + // byval. Everything else is in VGPRs. + return A->hasAttribute(Attribute::InReg) || + A->hasAttribute(Attribute::ByVal); + default: + // TODO: Should calls support inreg for SGPR inputs? 
+ return false; + } +} + +bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo) { + // Arguments to compute shaders are never a source of divergence. + CallingConv::ID CC = CB->getCallingConv(); + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return true; + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_LS: + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_Gfx: + // For non-compute shaders, SGPR inputs are marked with either inreg or + // byval. Everything else is in VGPRs. + return CB->paramHasAttr(ArgNo, Attribute::InReg) || + CB->paramHasAttr(ArgNo, Attribute::ByVal); default: // TODO: Should calls support inreg for SGPR inputs? return false; @@ -2556,77 +2658,6 @@ unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST) { return 13; } -// Given Imm, split it into the values to put into the SOffset and ImmOffset -// fields in an MUBUF instruction. Return false if it is not possible (due to a -// hardware bug needing a workaround). -// -// The required alignment ensures that individual address components remain -// aligned if they are aligned to begin with. It also ensures that additional -// offsets within the given alignment can be added to the resulting ImmOffset. -bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, - const GCNSubtarget *Subtarget, Align Alignment) { - const uint32_t MaxImm = alignDown(4095, Alignment.value()); - uint32_t Overflow = 0; - - if (Imm > MaxImm) { - if (Imm <= MaxImm + 64) { - // Use an SOffset inline constant for 4..64 - Overflow = Imm - MaxImm; - Imm = MaxImm; - } else { - // Try to keep the same value in SOffset for adjacent loads, so that - // the corresponding register contents can be re-used. 
- // - // Load values with all low-bits (except for alignment bits) set into - // SOffset, so that a larger range of values can be covered using - // s_movk_i32. - // - // Atomic operations fail to work correctly when individual address - // components are unaligned, even if their sum is aligned. - uint32_t High = (Imm + Alignment.value()) & ~4095; - uint32_t Low = (Imm + Alignment.value()) & 4095; - Imm = Low; - Overflow = High - Alignment.value(); - } - } - - // There is a hardware bug in SI and CI which prevents address clamping in - // MUBUF instructions from working correctly with SOffsets. The immediate - // offset is unaffected. - if (Overflow > 0 && - Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) - return false; - - ImmOffset = Imm; - SOffset = Overflow; - return true; -} - -SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) { - *this = getDefaultForCallingConv(F.getCallingConv()); - - StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString(); - if (!IEEEAttr.empty()) - IEEE = IEEEAttr == "true"; - - StringRef DX10ClampAttr - = F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString(); - if (!DX10ClampAttr.empty()) - DX10Clamp = DX10ClampAttr == "true"; - - StringRef DenormF32Attr = F.getFnAttribute("denormal-fp-math-f32").getValueAsString(); - if (!DenormF32Attr.empty()) - FP32Denormals = parseDenormalFPAttribute(DenormF32Attr); - - StringRef DenormAttr = F.getFnAttribute("denormal-fp-math").getValueAsString(); - if (!DenormAttr.empty()) { - DenormalMode DenormMode = parseDenormalFPAttribute(DenormAttr); - if (DenormF32Attr.empty()) - FP32Denormals = DenormMode; - FP64FP16Denormals = DenormMode; - } -} - namespace { struct SourceOfDivergence { @@ -2634,7 +2665,13 @@ struct SourceOfDivergence { }; const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr); +struct AlwaysUniform { + unsigned Intr; +}; +const AlwaysUniform *lookupAlwaysUniform(unsigned Intr); + #define GET_SourcesOfDivergence_IMPL +#define 
GET_UniformIntrinsics_IMPL #define GET_Gfx9BufferFormat_IMPL #define GET_Gfx10BufferFormat_IMPL #define GET_Gfx11PlusBufferFormat_IMPL @@ -2646,6 +2683,10 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID) { return lookupSourceOfDivergence(IntrID); } +bool isIntrinsicAlwaysUniform(unsigned IntrID) { + return lookupAlwaysUniform(IntrID); +} + const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, uint8_t NumFormat, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 4d3423592353..bdf7ccad9c76 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -10,8 +10,9 @@ #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H #include "SIDefines.h" -#include "llvm/ADT/FloatingPointMode.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Module.h" #include "llvm/Support/Alignment.h" #include <array> #include <functional> @@ -24,13 +25,13 @@ namespace llvm { struct Align; class Argument; class Function; -class GCNSubtarget; class GlobalValue; class MCInstrInfo; class MCRegisterClass; class MCRegisterInfo; class MCSubtargetInfo; class StringRef; +class TargetRegisterClass; class Triple; class raw_ostream; @@ -42,6 +43,13 @@ namespace AMDGPU { struct IsaVersion; +enum { + AMDHSA_COV2 = 2, + AMDHSA_COV3 = 3, + AMDHSA_COV4 = 4, + AMDHSA_COV5 = 5 +}; + /// \returns HSA OS ABI Version identification. 
std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI); /// \returns True if HSA OS ABI Version identification is 2, @@ -61,17 +69,20 @@ bool isHsaAbiVersion5(const MCSubtargetInfo *STI); bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI); /// \returns The offset of the multigrid_sync_arg argument from implicitarg_ptr -unsigned getMultigridSyncArgImplicitArgPosition(); +unsigned getMultigridSyncArgImplicitArgPosition(unsigned COV); /// \returns The offset of the hostcall pointer argument from implicitarg_ptr -unsigned getHostcallImplicitArgPosition(); +unsigned getHostcallImplicitArgPosition(unsigned COV); -unsigned getDefaultQueueImplicitArgPosition(); -unsigned getCompletionActionImplicitArgPosition(); +unsigned getDefaultQueueImplicitArgPosition(unsigned COV); +unsigned getCompletionActionImplicitArgPosition(unsigned COV); /// \returns Code object version. unsigned getAmdhsaCodeObjectVersion(); +/// \returns Code object version. +unsigned getCodeObjectVersion(const Module &M); + struct GcnBufferFormatInfo { unsigned Format; unsigned BitsPerComp; @@ -116,6 +127,7 @@ private: const MCSubtargetInfo &STI; TargetIDSetting XnackSetting; TargetIDSetting SramEccSetting; + unsigned CodeObjectVersion; public: explicit AMDGPUTargetID(const MCSubtargetInfo &STI); @@ -145,6 +157,10 @@ public: return XnackSetting; } + void setCodeObjectVersion(unsigned COV) { + CodeObjectVersion = COV; + } + /// Sets xnack setting to \p NewXnackSetting. void setXnackSetting(TargetIDSetting NewXnackSetting) { XnackSetting = NewXnackSetting; @@ -339,6 +355,7 @@ struct MIMGBaseOpcodeInfo { bool HasD16; bool MSAA; bool BVH; + bool A16; }; LLVM_READONLY @@ -544,8 +561,9 @@ enum Component : unsigned { MAX_OPR_NUM = DST_NUM + MAX_SRC_NUM }; -// Number of VGPR banks per VOPD component operand. -constexpr unsigned BANKS_NUM[] = {2, 4, 4, 2}; +// LSB mask for VGPR banks per VOPD component operand. +// 4 banks result in a mask 3, setting 2 lower bits. 
+constexpr unsigned VOPD_VGPR_BANK_MASKS[] = {1, 3, 3, 1}; enum ComponentIndex : unsigned { X = 0, Y = 1 }; constexpr unsigned COMPONENTS[] = {ComponentIndex::X, ComponentIndex::Y}; @@ -555,7 +573,7 @@ constexpr unsigned COMPONENTS_NUM = 2; class ComponentProps { private: unsigned SrcOperandsNum = 0; - std::optional<unsigned> MandatoryLiteralIdx; + unsigned MandatoryLiteralIdx = ~0u; bool HasSrc2Acc = false; public: @@ -571,13 +589,13 @@ public: } // Return true iif this component has a mandatory literal. - bool hasMandatoryLiteral() const { return MandatoryLiteralIdx.has_value(); } + bool hasMandatoryLiteral() const { return MandatoryLiteralIdx != ~0u; } // If this component has a mandatory literal, return component operand // index of this literal (i.e. either Component::SRC1 or Component::SRC2). unsigned getMandatoryLiteralCompOperandIndex() const { assert(hasMandatoryLiteral()); - return *MandatoryLiteralIdx; + return MandatoryLiteralIdx; } // Return true iif this component has operand @@ -593,8 +611,7 @@ public: private: bool hasMandatoryLiteralAt(unsigned CompSrcIdx) const { assert(CompSrcIdx < Component::MAX_SRC_NUM); - return hasMandatoryLiteral() && - *MandatoryLiteralIdx == Component::DST_NUM + CompSrcIdx; + return MandatoryLiteralIdx == Component::DST_NUM + CompSrcIdx; } }; @@ -811,10 +828,10 @@ int getIntegerAttribute(const Function &F, StringRef Name, int Default); /// \returns \p Default and emits error if one of the requested values cannot be /// converted to integer, or \p OnlyFirstRequired is false and "second" value is /// not present. -std::pair<int, int> getIntegerPairAttribute(const Function &F, - StringRef Name, - std::pair<int, int> Default, - bool OnlyFirstRequired = false); +std::pair<unsigned, unsigned> +getIntegerPairAttribute(const Function &F, StringRef Name, + std::pair<unsigned, unsigned> Default, + bool OnlyFirstRequired = false); /// Represents the counter values to wait for in an s_waitcnt instruction. 
/// @@ -847,11 +864,6 @@ struct Waitcnt { return VsCnt != ~0u; } - bool dominates(const Waitcnt &Other) const { - return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt && - LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt; - } - Waitcnt combined(const Waitcnt &Other) const { return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt), std::min(LgkmCnt, Other.LgkmCnt), @@ -965,6 +977,33 @@ bool isSymbolicDepCtrEncoding(unsigned Code, bool &HasNonDefaultVal, bool decodeDepCtr(unsigned Code, int &Id, StringRef &Name, unsigned &Val, bool &IsDefault, const MCSubtargetInfo &STI); +/// \returns Decoded VaVdst from given immediate \p Encoded. +unsigned decodeFieldVaVdst(unsigned Encoded); + +/// \returns Decoded VmVsrc from given immediate \p Encoded. +unsigned decodeFieldVmVsrc(unsigned Encoded); + +/// \returns Decoded SaSdst from given immediate \p Encoded. +unsigned decodeFieldSaSdst(unsigned Encoded); + +/// \returns \p VmVsrc as an encoded Depctr immediate. +unsigned encodeFieldVmVsrc(unsigned VmVsrc); + +/// \returns \p Encoded combined with encoded \p VmVsrc. +unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc); + +/// \returns \p VaVdst as an encoded Depctr immediate. +unsigned encodeFieldVaVdst(unsigned VaVdst); + +/// \returns \p Encoded combined with encoded \p VaVdst. +unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst); + +/// \returns \p SaSdst as an encoded Depctr immediate. +unsigned encodeFieldSaSdst(unsigned SaSdst); + +/// \returns \p Encoded combined with encoded \p SaSdst. 
+unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst); + } // namespace DepCtr namespace Exp { @@ -1102,6 +1141,7 @@ bool hasMIMG_R128(const MCSubtargetInfo &STI); bool hasA16(const MCSubtargetInfo &STI); bool hasG16(const MCSubtargetInfo &STI); bool hasPackedD16(const MCSubtargetInfo &STI); +unsigned getNSAMaxSize(const MCSubtargetInfo &STI); bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); @@ -1162,6 +1202,9 @@ unsigned getRegBitWidth(unsigned RCID); /// Get the size in bits of a register from the register class \p RC. unsigned getRegBitWidth(const MCRegisterClass &RC); +/// Get the size in bits of a register from the register class \p RC. +unsigned getRegBitWidth(const TargetRegisterClass &RC); + /// Get size of register operand unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, unsigned OpNo); @@ -1244,6 +1287,8 @@ bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi); bool isArgPassedInSGPR(const Argument *Arg); +bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo); + LLVM_READONLY bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, int64_t EncodedOffset); @@ -1282,10 +1327,6 @@ unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST); /// not the encoded offset. bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); -bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, - const GCNSubtarget *Subtarget, - Align Alignment = Align(4)); - LLVM_READNONE inline bool isLegal64BitDPPControl(unsigned DC) { return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST; @@ -1294,109 +1335,8 @@ inline bool isLegal64BitDPPControl(unsigned DC) { /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); -// Track defaults for fields in the MODE register. 
-struct SIModeRegisterDefaults { - /// Floating point opcodes that support exception flag gathering quiet and - /// propagate signaling NaN inputs per IEEE 754-2008. Min_dx10 and max_dx10 - /// become IEEE 754- 2008 compliant due to signaling NaN propagation and - /// quieting. - bool IEEE : 1; - - /// Used by the vector ALU to force DX10-style treatment of NaNs: when set, - /// clamp NaN to zero; otherwise, pass NaN through. - bool DX10Clamp : 1; - - /// If this is set, neither input or output denormals are flushed for most f32 - /// instructions. - DenormalMode FP32Denormals; - - /// If this is set, neither input or output denormals are flushed for both f64 - /// and f16/v2f16 instructions. - DenormalMode FP64FP16Denormals; - - SIModeRegisterDefaults() : - IEEE(true), - DX10Clamp(true), - FP32Denormals(DenormalMode::getIEEE()), - FP64FP16Denormals(DenormalMode::getIEEE()) {} - - SIModeRegisterDefaults(const Function &F); - - static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) { - SIModeRegisterDefaults Mode; - Mode.IEEE = !AMDGPU::isShader(CC); - return Mode; - } - - bool operator ==(const SIModeRegisterDefaults Other) const { - return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp && - FP32Denormals == Other.FP32Denormals && - FP64FP16Denormals == Other.FP64FP16Denormals; - } - - bool allFP32Denormals() const { - return FP32Denormals == DenormalMode::getIEEE(); - } - - bool allFP64FP16Denormals() const { - return FP64FP16Denormals == DenormalMode::getIEEE(); - } - - /// Get the encoding value for the FP_DENORM bits of the mode register for the - /// FP32 denormal mode. 
- uint32_t fpDenormModeSPValue() const { - if (FP32Denormals == DenormalMode::getPreserveSign()) - return FP_DENORM_FLUSH_IN_FLUSH_OUT; - if (FP32Denormals.Output == DenormalMode::PreserveSign) - return FP_DENORM_FLUSH_OUT; - if (FP32Denormals.Input == DenormalMode::PreserveSign) - return FP_DENORM_FLUSH_IN; - return FP_DENORM_FLUSH_NONE; - } - - /// Get the encoding value for the FP_DENORM bits of the mode register for the - /// FP64/FP16 denormal mode. - uint32_t fpDenormModeDPValue() const { - if (FP64FP16Denormals == DenormalMode::getPreserveSign()) - return FP_DENORM_FLUSH_IN_FLUSH_OUT; - if (FP64FP16Denormals.Output == DenormalMode::PreserveSign) - return FP_DENORM_FLUSH_OUT; - if (FP64FP16Denormals.Input == DenormalMode::PreserveSign) - return FP_DENORM_FLUSH_IN; - return FP_DENORM_FLUSH_NONE; - } - - /// Returns true if a flag is compatible if it's enabled in the callee, but - /// disabled in the caller. - static bool oneWayCompatible(bool CallerMode, bool CalleeMode) { - return CallerMode == CalleeMode || (!CallerMode && CalleeMode); - } - - // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should - // be able to override. - bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const { - if (DX10Clamp != CalleeMode.DX10Clamp) - return false; - if (IEEE != CalleeMode.IEEE) - return false; - - // Allow inlining denormals enabled into denormals flushed functions. 
- return oneWayCompatible(FP64FP16Denormals.Input != - DenormalMode::PreserveSign, - CalleeMode.FP64FP16Denormals.Input != - DenormalMode::PreserveSign) && - oneWayCompatible(FP64FP16Denormals.Output != - DenormalMode::PreserveSign, - CalleeMode.FP64FP16Denormals.Output != - DenormalMode::PreserveSign) && - oneWayCompatible(FP32Denormals.Input != DenormalMode::PreserveSign, - CalleeMode.FP32Denormals.Input != - DenormalMode::PreserveSign) && - oneWayCompatible(FP32Denormals.Output != DenormalMode::PreserveSign, - CalleeMode.FP32Denormals.Output != - DenormalMode::PreserveSign); - } -}; +/// \returns true if the intrinsic is uniform +bool isIntrinsicAlwaysUniform(unsigned IntrID); } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp index b1418253fd13..cbdbf1c16f9f 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp @@ -31,65 +31,25 @@ Align getAlign(DataLayout const &DL, const GlobalVariable *GV) { GV->getValueType()); } -static bool shouldLowerLDSToStruct(const GlobalVariable &GV, - const Function *F) { - // We are not interested in kernel LDS lowering for module LDS itself. - if (F && GV.getName() == "llvm.amdgcn.module.lds") +bool isDynamicLDS(const GlobalVariable &GV) { + // external zero size addrspace(3) without initializer implies cuda/hip extern + // __shared__ the semantics for such a variable appears to be that all extern + // __shared__ variables alias one another. This hits different handling. 
+ const Module *M = GV.getParent(); + const DataLayout &DL = M->getDataLayout(); + if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { return false; - - bool Ret = false; - SmallPtrSet<const User *, 8> Visited; - SmallVector<const User *, 16> Stack(GV.users()); - - assert(!F || isKernelCC(F)); - - while (!Stack.empty()) { - const User *V = Stack.pop_back_val(); - Visited.insert(V); - - if (isa<GlobalValue>(V)) { - // This use of the LDS variable is the initializer of a global variable. - // This is ill formed. The address of an LDS variable is kernel dependent - // and unknown until runtime. It can't be written to a global variable. - continue; - } - - if (auto *I = dyn_cast<Instruction>(V)) { - const Function *UF = I->getFunction(); - if (UF == F) { - // Used from this kernel, we want to put it into the structure. - Ret = true; - } else if (!F) { - // For module LDS lowering, lowering is required if the user instruction - // is from non-kernel function. - Ret |= !isKernelCC(UF); - } - continue; - } - - // User V should be a constant, recursively visit users of V. - assert(isa<Constant>(V) && "Expected a constant."); - append_range(Stack, V->users()); } - - return Ret; + uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); + return GV.hasExternalLinkage() && AllocSize == 0; } bool isLDSVariableToLower(const GlobalVariable &GV) { if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { return false; } - if (!GV.hasInitializer()) { - // addrspace(3) without initializer implies cuda/hip extern __shared__ - // the semantics for such a variable appears to be that all extern - // __shared__ variables alias one another, in which case this transform - // is not required - return false; - } - if (!isa<UndefValue>(GV.getInitializer())) { - // Initializers are unimplemented for LDS address space. - // Leave such variables in place for consistent error reporting. 
- return false; + if (isDynamicLDS(GV)) { + return true; } if (GV.isConstant()) { // A constant undef variable can't be written to, and any load is @@ -97,22 +57,12 @@ bool isLDSVariableToLower(const GlobalVariable &GV) { // dropped by the back end if not. This pass skips over it. return false; } - return true; -} - -std::vector<GlobalVariable *> findLDSVariablesToLower(Module &M, - const Function *F) { - std::vector<llvm::GlobalVariable *> LocalVars; - for (auto &GV : M.globals()) { - if (!isLDSVariableToLower(GV)) { - continue; - } - if (!shouldLowerLDSToStruct(GV, F)) { - continue; - } - LocalVars.push_back(&GV); + if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) { + // Initializers are unimplemented for LDS address space. + // Leave such variables in place for consistent error reporting. + return false; } - return LocalVars; + return true; } bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h index 92373fc14a98..df37c420fa72 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h @@ -28,9 +28,8 @@ namespace AMDGPU { Align getAlign(DataLayout const &DL, const GlobalVariable *GV); +bool isDynamicLDS(const GlobalVariable &GV); bool isLDSVariableToLower(const GlobalVariable &GV); -std::vector<GlobalVariable *> findLDSVariablesToLower(Module &M, - const Function *F); /// Given a \p Def clobbering a load from \p Ptr according to the MSSA check /// if this is actually a memory update or an artificial clobber to facilitate diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index 4ad93f7b0b68..a92d574b1848 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -811,6 +811,38 @@ msgpack::MapDocNode 
AMDGPUPALMetadata::getShaderFunction(StringRef Name) { return Functions[Name].getMap(/*Convert=*/true); } +msgpack::DocNode &AMDGPUPALMetadata::refComputeRegisters() { + auto &N = + MsgPackDoc.getRoot() + .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")] + .getArray(/*Convert=*/true)[0] + .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".compute_registers")]; + N.getMap(/*Convert=*/true); + return N; +} + +msgpack::MapDocNode AMDGPUPALMetadata::getComputeRegisters() { + if (ComputeRegisters.isEmpty()) + ComputeRegisters = refComputeRegisters(); + return ComputeRegisters.getMap(); +} + +msgpack::DocNode &AMDGPUPALMetadata::refGraphicsRegisters() { + auto &N = + MsgPackDoc.getRoot() + .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")] + .getArray(/*Convert=*/true)[0] + .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".graphics_registers")]; + N.getMap(/*Convert=*/true); + return N; +} + +msgpack::MapDocNode AMDGPUPALMetadata::getGraphicsRegisters() { + if (GraphicsRegisters.isEmpty()) + GraphicsRegisters = refGraphicsRegisters(); + return GraphicsRegisters.getMap(); +} + // Return the PAL metadata hardware shader stage name. static const char *getStageName(CallingConv::ID CC) { switch (CC) { @@ -833,15 +865,21 @@ static const char *getStageName(CallingConv::ID CC) { } } +msgpack::DocNode &AMDGPUPALMetadata::refHwStage() { + auto &N = + MsgPackDoc.getRoot() + .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")] + .getArray(/*Convert=*/true)[0] + .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".hardware_stages")]; + N.getMap(/*Convert=*/true); + return N; +} + // Get (create if necessary) the .hardware_stages entry for the given calling // convention. 
msgpack::MapDocNode AMDGPUPALMetadata::getHwStage(unsigned CC) { if (HwStages.isEmpty()) - HwStages = MsgPackDoc.getRoot() - .getMap(/*Convert=*/true)["amdpal.pipelines"] - .getArray(/*Convert=*/true)[0] - .getMap(/*Convert=*/true)[".hardware_stages"] - .getMap(/*Convert=*/true); + HwStages = refHwStage(); return HwStages.getMap()[getStageName(CC)].getMap(/*Convert=*/true); } @@ -874,3 +912,78 @@ void AMDGPUPALMetadata::reset() { Registers = MsgPackDoc.getEmptyNode(); HwStages = MsgPackDoc.getEmptyNode(); } + +unsigned AMDGPUPALMetadata::getPALVersion(unsigned idx) { + assert(idx < 2 && + "illegal index to PAL version - should be 0 (major) or 1 (minor)"); + if (!VersionChecked) { + if (Version.isEmpty()) { + auto &M = MsgPackDoc.getRoot().getMap(/*Convert=*/true); + auto I = M.find(MsgPackDoc.getNode("amdpal.version")); + if (I != M.end()) + Version = I->second; + } + VersionChecked = true; + } + if (Version.isEmpty()) + // Default to 2.6 if there's no version info + return idx ? 6 : 2; + return Version.getArray()[idx].getUInt(); +} + +unsigned AMDGPUPALMetadata::getPALMajorVersion() { return getPALVersion(0); } + +unsigned AMDGPUPALMetadata::getPALMinorVersion() { return getPALVersion(1); } + +// Set the field in a given .hardware_stages entry +void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, unsigned Val) { + getHwStage(CC)[field] = Val; +} + +void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, bool Val) { + getHwStage(CC)[field] = Val; +} + +void AMDGPUPALMetadata::setComputeRegisters(StringRef field, unsigned Val) { + getComputeRegisters()[field] = Val; +} + +void AMDGPUPALMetadata::setComputeRegisters(StringRef field, bool Val) { + getComputeRegisters()[field] = Val; +} + +msgpack::DocNode *AMDGPUPALMetadata::refComputeRegister(StringRef field) { + auto M = getComputeRegisters(); + auto I = M.find(field); + return I == M.end() ? 
nullptr : &I->second; +} + +bool AMDGPUPALMetadata::checkComputeRegisters(StringRef field, unsigned Val) { + if (auto N = refComputeRegister(field)) + return N->getUInt() == Val; + return false; +} + +bool AMDGPUPALMetadata::checkComputeRegisters(StringRef field, bool Val) { + if (auto N = refComputeRegister(field)) + return N->getBool() == Val; + return false; +} + +void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field, unsigned Val) { + getGraphicsRegisters()[field] = Val; +} + +void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field, bool Val) { + getGraphicsRegisters()[field] = Val; +} + +void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field1, StringRef field2, + unsigned Val) { + getGraphicsRegisters()[field1].getMap(true)[field2] = Val; +} + +void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field1, StringRef field2, + bool Val) { + getGraphicsRegisters()[field1].getMap(true)[field2] = Val; +} diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h index a45a799e38a9..e477904cb81f 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h @@ -27,6 +27,11 @@ class AMDGPUPALMetadata { msgpack::DocNode Registers; msgpack::DocNode HwStages; msgpack::DocNode ShaderFunctions; + bool VersionChecked = false; + msgpack::DocNode Version; + // From PAL version >= 3.0 + msgpack::DocNode ComputeRegisters; + msgpack::DocNode GraphicsRegisters; public: // Read the amdgpu.pal.metadata supplied by the frontend, ready for @@ -129,6 +134,26 @@ public: // Set legacy PAL metadata format. 
void setLegacy(); + unsigned getPALMajorVersion(); + unsigned getPALMinorVersion(); + + void setHwStage(unsigned CC, StringRef field, unsigned Val); + void setHwStage(unsigned CC, StringRef field, bool Val); + + void setComputeRegisters(StringRef field, unsigned Val); + void setComputeRegisters(StringRef field, bool Val); + + // If the field does not exist will return nullptr rather than creating a new + // entry (which is the behaviour of the other functions). + msgpack::DocNode *refComputeRegister(StringRef field); + bool checkComputeRegisters(StringRef field, unsigned Val); + bool checkComputeRegisters(StringRef field, bool Val); + + void setGraphicsRegisters(StringRef field, unsigned Val); + void setGraphicsRegisters(StringRef field, bool Val); + void setGraphicsRegisters(StringRef field1, StringRef field2, unsigned Val); + void setGraphicsRegisters(StringRef field1, StringRef field2, bool Val); + // Erase all PAL metadata. void reset(); @@ -151,10 +176,29 @@ private: // Get (create if necessary) a function in the shader functions map. msgpack::MapDocNode getShaderFunction(StringRef Name); + // Reference (create if necessary) the node for the compute_registers map. + msgpack::DocNode &refComputeRegisters(); + + // Get (create if necessary) the .compute_registers entry. + msgpack::MapDocNode getComputeRegisters(); + + // Reference (create if necessary) the node for the graphics registers map. + msgpack::DocNode &refGraphicsRegisters(); + + // Get (create if necessary) the .graphics_registers entry. + msgpack::MapDocNode getGraphicsRegisters(); + + // Reference (create if necessary) the node for the hardware_stages map. + msgpack::DocNode &refHwStage(); + // Get (create if necessary) the .hardware_stages entry for the given calling // convention. msgpack::MapDocNode getHwStage(unsigned CC); + // Get the PAL version major (idx 0) or minor (idx 1). 
This is an internal + // helper for the public wrapper functions that request Major or Minor + unsigned getPALVersion(unsigned idx); + bool setFromLegacyBlob(StringRef Blob); bool setFromMsgPackBlob(StringRef Blob); void toLegacyBlob(std::string &Blob); diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td index 71de20223e9f..7d03150bf5b1 100644 --- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td @@ -23,7 +23,6 @@ class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : Enc64 { let Inst{31-26} = 0x33; // VOP3P encoding let Inst{25-24} = 0x1; // VINTERP sub-encoding - let Inst{23} = 0; // reserved let Inst{7-0} = vdst; let Inst{10-8} = waitexp; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 285499ad6984..1a8efc6e3df2 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -120,28 +120,28 @@ multiclass VOP1Inst <string opName, VOPProfile P, def _e32 : VOP1_Pseudo <opName, P>; else // Only for V_MOV_B32 - def _e32 : VOP1_Pseudo <opName, P>, VOPD_Component<VOPDOp, "v_mov_b32">; + def _e32 : VOP1_Pseudo <opName, P>, VOPD_Component<VOPDOp, opName>; def _e64 : VOP3InstBase <opName, P, node>; } - foreach _ = BoolToList<P.HasExtSDWA>.ret in + if P.HasExtSDWA then def _sdwa : VOP1_SDWA_Pseudo <opName, P>; - foreach _ = BoolToList<P.HasExtDPP>.ret in + if P.HasExtDPP then def _dpp : VOP1_DPP_Pseudo <opName, P>; let SubtargetPredicate = isGFX11Plus in { - foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in + if P.HasExtVOP3DPP then def _e64_dpp : VOP3_DPP_Pseudo <opName, P>; } // End SubtargetPredicate = isGFX11Plus def : MnemonicAlias<opName#"_e32", opName>, LetDummies; def : MnemonicAlias<opName#"_e64", opName>, LetDummies; - foreach _ = BoolToList<P.HasExtSDWA>.ret in + if P.HasExtSDWA then def : MnemonicAlias<opName#"_sdwa", opName>, LetDummies; - foreach _ = 
BoolToList<P.HasExtDPP>.ret in + if P.HasExtDPP then def : MnemonicAlias<opName#"_dpp", opName, AMDGPUAsmVariants.DPP>, LetDummies; } @@ -229,9 +229,9 @@ defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>; // TODO: Make profile for this, there is VOP3 encoding also def V_READFIRSTLANE_B32 : InstSI <(outs SReg_32:$vdst), - (ins VRegOrLds_32:$src0), + (ins VRegOrLdsSrc_32:$src0), "v_readfirstlane_b32 $vdst, $src0", - [(set i32:$vdst, (int_amdgcn_readfirstlane (i32 VRegOrLds_32:$src0)))]>, + [(set i32:$vdst, (int_amdgcn_readfirstlane (i32 VRegOrLdsSrc_32:$src0)))]>, Enc32 { let isCodeGenOnly = 0; @@ -266,7 +266,7 @@ defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>; } defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>; -defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>; +defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, any_fpextend>; // OMod clears exceptions when set in this instruction defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_uint>; @@ -290,15 +290,15 @@ defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32_SPECIAL_OMOD, fp_to_ defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32_SPECIAL_OMOD, fp_to_sint>; let FPDPRounding = 1, isReMaterializable = 0 in { let OtherPredicates = [NotHasTrue16BitInsts] in - defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>; + defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, any_fpround>; let OtherPredicates = [HasTrue16BitInsts] in - defm V_CVT_F16_F32_t16 : VOP1Inst <"v_cvt_f16_f32_t16", VOPProfile_True16<VOP_F16_F32>, fpround>; + defm V_CVT_F16_F32_t16 : VOP1Inst <"v_cvt_f16_f32_t16", VOPProfile_True16<VOP_F16_F32>, any_fpround>; } // End FPDPRounding = 1, isReMaterializable = 0 let OtherPredicates = [NotHasTrue16BitInsts] in -defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>; +defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, any_fpextend>; 
let OtherPredicates = [HasTrue16BitInsts] in -defm V_CVT_F32_F16_t16 : VOP1Inst <"v_cvt_f32_f16_t16", VOPProfile_True16<VOP_F32_F16>, fpextend>; +defm V_CVT_F32_F16_t16 : VOP1Inst <"v_cvt_f32_f16_t16", VOPProfile_True16<VOP_F32_F16>, any_fpextend>; let ReadsModeReg = 0, mayRaiseFPException = 0 in { defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; @@ -321,8 +321,8 @@ defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>; defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>; let TRANS = 1, SchedRW = [WriteTrans32] in { -defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>; -defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>; +defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, AMDGPUexp>; +defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, AMDGPUlog>; defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>; defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>; defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>; @@ -332,7 +332,7 @@ defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, any_amdgcn_sqrt>; let TRANS = 1, SchedRW = [WriteTrans64] in { defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>; defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>; -defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, any_amdgcn_sqrt>; +defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>; } // End TRANS = 1, SchedRW = [WriteTrans64] let TRANS = 1, SchedRW = [WriteTrans32] in { @@ -487,8 +487,8 @@ let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>; defm V_RSQ_F16 : VOP1Inst_t16 <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>; -defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, flog2>; -defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, fexp2>; +defm V_LOG_F16 : 
VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, AMDGPUlogf16>; +defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>; defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; } // End TRANS = 1, SchedRW = [WriteTrans32] @@ -528,13 +528,10 @@ def : GCNPat< >; } -def VOP_SWAP_I32 : VOPProfile<[i32, i32, i32, untyped]> { - let Outs32 = (outs VGPR_32:$vdst, VGPR_32:$vdst1); - let Ins32 = (ins VGPR_32:$src0, VGPR_32:$src1); - let Outs64 = Outs32; +def VOP_SWAP_I32 : VOPProfile<[i32, i32, untyped, untyped]> { + let Outs32 = (outs VGPR_32:$vdst, VRegSrc_32:$vdst1); + let Ins32 = (ins VRegSrc_32:$src0, VGPR_32:$src1); let Asm32 = " $vdst, $src0"; - let Asm64 = ""; - let Ins64 = (ins); } let SubtargetPredicate = isGFX9Plus in { @@ -633,7 +630,7 @@ let SubtargetPredicate = isGFX10Plus in { def VOPProfileAccMov : VOP_NO_EXT<VOP_I32_I32> { let DstRC = RegisterOperand<AGPR_32>; - let Src0RC32 = RegisterOperand<AGPR_32>; + let Src0RC32 = ARegSrc_32; let Asm32 = " $vdst, $src0"; } @@ -847,7 +844,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { VOP3e_gfx10<{0, 1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; } multiclass VOP1_Real_sdwa_gfx10<bits<9> op> { - foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx10 : VOP_SDWA10_Real<!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, VOP1_SDWA9Ae<op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { @@ -855,13 +852,13 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } } multiclass VOP1_Real_dpp_gfx10<bits<9> op> { - foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX10> { let DecoderNamespace = "SDWA10"; } } multiclass 
VOP1_Real_dpp8_gfx10<bits<9> op> { - foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> { let DecoderNamespace = "DPP8"; } @@ -1067,17 +1064,17 @@ multiclass VOP1_Real_e32e64_vi <bits<10> op> { multiclass VOP1_Real_vi <bits<10> op> { defm NAME : VOP1_Real_e32e64_vi <op>; - foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA then def _sdwa_vi : VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; - foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx9 : VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; - foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then def _dpp_vi : VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>, VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>; @@ -1241,12 +1238,12 @@ multiclass VOP1_Real_gfx9 <bits<10> op> { defm NAME : VOP1_Real_e32e64_vi <op>; } - foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx9 : VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; - foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then def _dpp_gfx9 : VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>; @@ -1258,14 +1255,14 @@ multiclass VOP1_Real_NoDstSel_SDWA_gfx9 
<bits<10> op> { defm NAME : VOP1_Real_e32e64_vi <op>; } - foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx9 : VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { let Inst{42-40} = 6; } - foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then def _dpp_gfx9 : VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index a1f99ca3aefa..481a162748e6 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -157,7 +157,7 @@ multiclass VOP2Inst_e64<string opName, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; let SubtargetPredicate = isGFX11Plus in { - foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in + if P.HasExtVOP3DPP then def _e64_dpp : VOP3_DPP_Pseudo <opName, P>; } // End SubtargetPredicate = isGFX11Plus } // End renamedInGFX9 = GFX9Renamed @@ -167,7 +167,7 @@ multiclass VOP2Inst_sdwa<string opName, VOPProfile P, bit GFX9Renamed = 0> { let renamedInGFX9 = GFX9Renamed in { - foreach _ = BoolToList<P.HasExtSDWA>.ret in + if P.HasExtSDWA then def _sdwa : VOP2_SDWA_Pseudo <opName, P>; } // End renamedInGFX9 = GFX9Renamed } @@ -181,7 +181,7 @@ multiclass VOP2Inst<string opName, VOP2Inst_e64<opName, P, node, revOp, GFX9Renamed>, VOP2Inst_sdwa<opName, P, GFX9Renamed> { let renamedInGFX9 = GFX9Renamed in { - foreach _ = BoolToList<P.HasExtDPP>.ret in + if P.HasExtDPP then def _dpp : VOP2_DPP_Pseudo <opName, P>; } } @@ -227,7 +227,7 @@ multiclass VOP2Inst_VOPD<string opName, VOP2Inst_e64<opName, P, node, revOp, GFX9Renamed>, VOP2Inst_sdwa<opName, P, GFX9Renamed> { let renamedInGFX9 = GFX9Renamed in { - 
foreach _ = BoolToList<P.HasExtDPP>.ret in + if P.HasExtDPP then def _dpp : VOP2_DPP_Pseudo <opName, P>; } } @@ -246,11 +246,11 @@ multiclass VOP2bInst <string opName, let usesCustomInserter = true; } - foreach _ = BoolToList<P.HasExtSDWA>.ret in + if P.HasExtSDWA then def _sdwa : VOP2_SDWA_Pseudo <opName, P> { let AsmMatchConverter = "cvtSdwaVOP2b"; } - foreach _ = BoolToList<P.HasExtDPP>.ret in + if P.HasExtDPP then def _dpp : VOP2_DPP_Pseudo <opName, P>; } // End Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] @@ -258,7 +258,7 @@ multiclass VOP2bInst <string opName, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; let SubtargetPredicate = isGFX11Plus in { - foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in + if P.HasExtVOP3DPP then def _e64_dpp : VOP3_DPP_Pseudo <opName, P>; } // End SubtargetPredicate = isGFX11Plus } @@ -297,12 +297,12 @@ multiclass Commutable_REV<revOp#"_e32", !eq(revOp, opName)>, VOPD_Component<VOPDOp, VOPDName>; - foreach _ = BoolToList<P.HasExtSDWA>.ret in + if P.HasExtSDWA then def _sdwa : VOP2_SDWA_Pseudo <opName, P> { let AsmMatchConverter = "cvtSdwaVOP2e"; } - foreach _ = BoolToList<P.HasExtDPP>.ret in + if P.HasExtDPP then def _dpp : VOP2_DPP_Pseudo <opName, P>; } @@ -312,7 +312,7 @@ multiclass } let SubtargetPredicate = isGFX11Plus in { - foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in + if P.HasExtVOP3DPP then def _e64_dpp : VOP3_DPP_Pseudo <opName, P>; } // End SubtargetPredicate = isGFX11Plus } @@ -357,7 +357,7 @@ class VOP_MADK_Base<ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { } class VOP_MADAK <ValueType vt> : VOP_MADK_Base<vt> { - field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); + field Operand ImmOpType = !if(!eq(vt.Size, 32), KImmFP32, KImmFP16); field dag Ins32 = !if(!eq(vt.Size, 32), (ins VSrc_f32_Deferred:$src0, VGPR_32:$src1, ImmOpType:$imm), (ins VSrc_f16_Deferred:$src0, VGPR_32:$src1, ImmOpType:$imm)); @@ -383,7 +383,7 @@ def VOP_MADAK_F16_t16 : VOP_MADAK <f16> { def VOP_MADAK_F32 : 
VOP_MADAK <f32>; class VOP_MADMK <ValueType vt> : VOP_MADK_Base<vt> { - field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); + field Operand ImmOpType = !if(!eq(vt.Size, 32), KImmFP32, KImmFP16); field dag Ins32 = !if(!eq(vt.Size, 32), (ins VSrc_f32_Deferred:$src0, ImmOpType:$imm, VGPR_32:$src1), (ins VSrc_f16_Deferred:$src0, ImmOpType:$imm, VGPR_32:$src1)); @@ -660,7 +660,7 @@ def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>; def VOP_READLANE : VOPProfile<[i32, i32, i32, untyped]> { let Outs32 = (outs SReg_32:$vdst); let Outs64 = Outs32; - let Ins32 = (ins VRegOrLds_32:$src0, SCSrc_b32:$src1); + let Ins32 = (ins VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1); let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; @@ -764,19 +764,20 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, let isConvergent = 1, Uses = []<Register> in { def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>; - -let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { +let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>; -} // End $vdst = $vdst_in, DisableEncoding $vdst_in +} // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in } // End isConvergent = 1 let isReMaterializable = 1 in { defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>; defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32, add_ctpop>; +let IsNeverUniform = 1 in { defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>; defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>; -defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>; +} // End IsNeverUniform = 1 +defm 
V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, any_fldexp>; let ReadsModeReg = 0, mayRaiseFPException = 0 in { defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_i16_f32>; @@ -862,9 +863,18 @@ def : divergent_i64_BinOp <xor, V_XOR_B32_e64>; // 16-Bit Operand Instructions //===----------------------------------------------------------------------===// -def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16<VOP_F16_F16_I32> { - // The ldexp.f16 intrinsic expects a i32 src1 operand, though the hardware - // encoding treats src1 as an f16 +// The ldexp.f16 intrinsic expects an integer src1 operand, though the hardware +// encoding treats src1 as an f16 +def LDEXP_F16_VOPProfile : VOPProfile <[f16, f16, f16, untyped]> { + let Src1Mod = Int32InputMods; + let Src1ModDPP = IntVRegInputMods; + let Src1ModVOP3DPP = IntVRegInputMods; + // SDWA sext is the only modifier allowed. + let HasSrc1IntMods = 1; + let HasSrc1FloatMods = 0; + let Src1ModSDWA = Int16SDWAInputMods; +} +def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16<VOP_F16_F16_F16> { let Src1RC32 = RegisterOperand<VGPR_32_Lo128>; let Src1DPP = VGPR_32_Lo128; let Src1ModDPP = IntT16VRegInputMods; @@ -873,9 +883,9 @@ def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16<VOP_F16_F16_I32> { let isReMaterializable = 1 in { let FPDPRounding = 1 in { let SubtargetPredicate = NotHasTrue16BitInsts, OtherPredicates = [Has16BitInsts] in - defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; + defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", LDEXP_F16_VOPProfile>; let SubtargetPredicate = HasTrue16BitInsts in - defm V_LDEXP_F16_t16 : VOP2Inst <"v_ldexp_f16_t16", LDEXP_F16_VOPProfile_True16, AMDGPUldexp>; + defm V_LDEXP_F16_t16 : VOP2Inst <"v_ldexp_f16_t16", LDEXP_F16_VOPProfile_True16>; } // End FPDPRounding = 1 // FIXME VOP3 Only instructions.
NFC using VOPProfile_True16 for these until a planned change to use a new register class for VOP3 encoded True16 instructions defm V_LSHLREV_B16 : VOP2Inst_e64_t16 <"v_lshlrev_b16", VOP_I16_I16_I16, clshl_rev_16>; @@ -898,6 +908,21 @@ defm V_MIN_I16 : VOP2Inst_e64_t16 <"v_min_i16", VOP_I16_I16_I16, smin>; } // End isCommutable = 1 } // End isReMaterializable = 1 +class LDEXP_F16_Pat <SDPatternOperator op, VOP_Pseudo inst, VOPProfile P = inst.Pfl> : GCNPat < + (P.DstVT (op (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (i16 (VOP3Mods0 P.Src1VT:$src1, i32:$src1_modifiers)))), + (inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + $clamp, /* clamp */ + $omod /* omod */) +>; + +let OtherPredicates = [NotHasTrue16BitInsts] in +def : LDEXP_F16_Pat<any_fldexp, V_LDEXP_F16_e64>; + +let OtherPredicates = [HasTrue16BitInsts] in +def : LDEXP_F16_Pat<any_fldexp, V_LDEXP_F16_t16_e64>; + let SubtargetPredicate = isGFX11Plus in { let isCommutable = 1 in { defm V_AND_B16_t16 : VOP2Inst_e64 <"v_and_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, and>; @@ -1266,13 +1291,13 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { VOP3e_gfx11<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; } multiclass VOP2_Real_dpp_gfx11<bits<6> op> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then def _dpp_gfx11 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX11> { let DecoderNamespace = "DPPGFX11"; } } multiclass VOP2_Real_dpp8_gfx11<bits<6> op> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then def _dpp8_gfx11 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> { let DecoderNamespace = "DPP8GFX11"; } @@ -1302,7 +1327,7 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { multiclass VOP2_Real_dpp_with_name_gfx11<bits<6>
op, string opName, string asmName> { defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); - foreach _ = BoolToList<ps.Pfl.HasExtDPP>.ret in + if ps.Pfl.HasExtDPP then def _dpp_gfx11 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX11> { let AsmString = asmName # ps.Pfl.AsmDPP16; @@ -1312,7 +1337,7 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { multiclass VOP2_Real_dpp8_with_name_gfx11<bits<6> op, string opName, string asmName> { defvar ps = !cast<VOP2_Pseudo>(opName#"_e32"); - foreach _ = BoolToList<ps.Pfl.HasExtDPP>.ret in + if ps.Pfl.HasExtDPP then def _dpp8_gfx11 : VOP2_DPP8<op, ps> { let AsmString = asmName # ps.Pfl.AsmDPP8; let DecoderNamespace = "DPP8GFX11"; @@ -1329,14 +1354,14 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { } } multiclass VOP2be_Real_dpp_gfx11<bits<6> op, string opName, string asmName> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then def _dpp_gfx11 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX11, asmName> { string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; let AsmString = asmName # !subst(", vcc", "", AsmDPP); let DecoderNamespace = "DPPGFX11"; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then def _dpp_w32_gfx11 : Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; @@ -1344,7 +1369,7 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then def _dpp_w64_gfx11 : Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { string AsmDPP = 
!cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; @@ -1354,14 +1379,14 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { } } multiclass VOP2be_Real_dpp8_gfx11<bits<6> op, string opName, string asmName> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then def _dpp8_gfx11 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; let AsmString = asmName # !subst(", vcc", "", AsmDPP8); let DecoderNamespace = "DPP8GFX11"; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then def _dpp8_w32_gfx11 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; @@ -1369,7 +1394,7 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then def _dpp8_w64_gfx11 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; @@ -1477,19 +1502,19 @@ defm V_FMAMK_F16_t16 : VOP2Only_Real_MADK_gfx11_with_name<0x037, "v_fmamk_ defm V_FMAAK_F16_t16 : VOP2Only_Real_MADK_gfx11_with_name<0x038, "v_fmaak_f16">; // VOP3 only. 
-defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11<0x25d>; -defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11<0x31c>; -defm V_BFM_B32 : VOP3Only_Realtriple_gfx11<0x31d>; -defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11<0x31e>; -defm V_MBCNT_LO_U32_B32 : VOP3Only_Realtriple_gfx11<0x31f>; -defm V_MBCNT_HI_U32_B32 : VOP3Only_Realtriple_gfx11<0x320>; -defm V_CVT_PKNORM_I16_F32 : VOP3Only_Realtriple_gfx11<0x321>; -defm V_CVT_PKNORM_U16_F32 : VOP3Only_Realtriple_gfx11<0x322>; -defm V_CVT_PK_U16_U32 : VOP3Only_Realtriple_gfx11<0x323>; -defm V_CVT_PK_I16_I32 : VOP3Only_Realtriple_gfx11<0x324>; -defm V_ADD_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x300>; -defm V_SUB_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x301>; -defm V_SUBREV_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x302>; +defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11<0x25d>; +defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11<0x31c>; +defm V_BFM_B32 : VOP3Only_Realtriple_gfx11<0x31d>; +defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11<0x31e>; +defm V_MBCNT_LO_U32_B32 : VOP3Only_Realtriple_gfx11<0x31f>; +defm V_MBCNT_HI_U32_B32 : VOP3Only_Realtriple_gfx11<0x320>; +defm V_CVT_PK_NORM_I16_F32 : VOP3Only_Realtriple_with_name_gfx11<0x321, "V_CVT_PKNORM_I16_F32", "v_cvt_pk_norm_i16_f32">; +defm V_CVT_PK_NORM_U16_F32 : VOP3Only_Realtriple_with_name_gfx11<0x322, "V_CVT_PKNORM_U16_F32", "v_cvt_pk_norm_u16_f32">; +defm V_CVT_PK_U16_U32 : VOP3Only_Realtriple_gfx11<0x323>; +defm V_CVT_PK_I16_I32 : VOP3Only_Realtriple_gfx11<0x324>; +defm V_ADD_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x300>; +defm V_SUB_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x301>; +defm V_SUBREV_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x302>; let SubtargetPredicate = isGFX11Plus in { defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx11>; @@ -1533,7 +1558,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { VOP3e_gfx10<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; } multiclass VOP2_Real_sdwa_gfx10<bits<6> op> { - foreach _ = 
BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in + if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx10 : VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { @@ -1541,13 +1566,13 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } } multiclass VOP2_Real_dpp_gfx10<bits<6> op> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in + if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX10> { let DecoderNamespace = "SDWA10"; } } multiclass VOP2_Real_dpp8_gfx10<bits<6> op> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in + if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> { let DecoderNamespace = "DPP8"; } @@ -1576,7 +1601,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { let DecoderNamespace = "SDWA10" in { multiclass VOP2_Real_sdwa_gfx10_with_name<bits<6> op, string opName, string asmName> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx10 : VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { @@ -1586,7 +1611,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } multiclass VOP2_Real_dpp_gfx10_with_name<bits<6> op, string opName, string asmName> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX10> { VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32"); let AsmString = asmName # 
ps.Pfl.AsmDPP16; @@ -1594,7 +1619,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } multiclass VOP2_Real_dpp8_gfx10_with_name<bits<6> op, string opName, string asmName> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32"); let AsmString = asmName # ps.Pfl.AsmDPP8; @@ -1622,7 +1647,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } } multiclass VOP2be_Real_sdwa_gfx10<bits<6> op, string opName, string asmName> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx10 : VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { @@ -1630,7 +1655,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands); let DecoderNamespace = "SDWA10"; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9 then def _sdwa_w32_gfx10 : Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { @@ -1640,7 +1665,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { let DecoderNamespace = "SDWA10"; let WaveSizePredicate = isWave32; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9 then def _sdwa_w64_gfx10 : Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>, VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> { @@ -1652,14 +1677,14 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = 
"GFX10" in { } } multiclass VOP2be_Real_dpp_gfx10<bits<6> op, string opName, string asmName> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX10, asmName> { string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; let AsmString = asmName # !subst(", vcc", "", AsmDPP); let DecoderNamespace = "SDWA10"; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then def _dpp_w32_gfx10 : Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; @@ -1667,7 +1692,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then def _dpp_w64_gfx10 : Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> { string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16; @@ -1677,14 +1702,14 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } } multiclass VOP2be_Real_dpp8_gfx10<bits<6> op, string opName, string asmName> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; let AsmString = asmName # !subst(", vcc", "", AsmDPP8); let DecoderNamespace = "DPP8"; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then def _dpp8_w32_gfx10 : VOP2_DPP8<op, 
!cast<VOP2_Pseudo>(opName#"_e32")> { string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; @@ -1692,7 +1717,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { let isAsmParserOnly = 1; let WaveSizePredicate = isWave32; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in + if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then def _dpp8_w64_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> { string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8; @@ -2014,14 +2039,14 @@ multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> : } // End AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8" multiclass VOP2_SDWA_Real <bits<6> op> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in + if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA then def _sdwa_vi : VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; } multiclass VOP2_SDWA9_Real <bits<6> op> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in + if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx9 : VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; @@ -2044,14 +2069,14 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName let AsmString = AsmName # ps.AsmOperands; let DecoderNamespace = "GFX8"; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA>.ret in + if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA then def _sdwa_vi : VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>, VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> { VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa"); let AsmString = AsmName # ps.AsmOperands; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in + if 
!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP then def _dpp_vi : VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.VI>, VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> { @@ -2078,14 +2103,14 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> { let AsmString = AsmName # ps.AsmOperands; let DecoderNamespace = "GFX9"; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA9>.ret in + if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx9 : VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>, VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> { VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa"); let AsmString = AsmName # ps.AsmOperands; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in + if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP then def _dpp_gfx9 : VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.GFX9>, VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> { @@ -2106,12 +2131,12 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> { VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> { let DecoderNamespace = "GFX9"; } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in + if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx9 : VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { } - foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then def _dpp_gfx9 : VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> { @@ -2124,7 +2149,7 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> { multiclass VOP2_Real_e32e64_vi <bits<6> op> : Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> { - foreach _ = 
BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then def _dpp_vi : VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>, VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>; @@ -2271,7 +2296,7 @@ let AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A" in { multiclass VOP2_Real_e32e64_gfx90a <bits<6> op> : Base_VOP2_Real_e32e64_gfx90a<op> { - foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then def _dpp_gfx90a : VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX90A>, VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> { diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 848d1ad1f6c7..c0e0ac1b4ec8 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -61,7 +61,7 @@ class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> : def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> { let Src0Mod = FPVRegInputMods; let Ins64 = (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, - Attr:$attr, AttrChan:$attrchan, + InterpAttr:$attr, InterpAttrChan:$attrchan, clampmod0:$clamp, omod0:$omod); let Asm64 = "$vdst, $src0_modifiers, $attr$attrchan$clamp$omod"; @@ -69,7 +69,7 @@ def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> { def VOP3_INTERP_MOV : VOPProfile<[f32, i32, i32, untyped]> { let Ins64 = (ins InterpSlot:$src0, - Attr:$attr, AttrChan:$attrchan, + InterpAttr:$attr, InterpAttrChan:$attrchan, clampmod0:$clamp, omod0:$omod); let Asm64 = "$vdst, $src0, $attr$attrchan$clamp$omod"; @@ -90,16 +90,16 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod, dag ret = !if(HasSrc2, !if(HasOMod, (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, - Attr:$attr, AttrChan:$attrchan, + InterpAttr:$attr, InterpAttrChan:$attrchan, Src2Mod:$src2_modifiers, VRegSrc_32:$src2, 
highmod:$high, clampmod0:$clamp, omod0:$omod), (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, - Attr:$attr, AttrChan:$attrchan, + InterpAttr:$attr, InterpAttrChan:$attrchan, Src2Mod:$src2_modifiers, VRegSrc_32:$src2, highmod:$high, clampmod0:$clamp) ), (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, - Attr:$attr, AttrChan:$attrchan, + InterpAttr:$attr, InterpAttrChan:$attrchan, highmod:$high, clampmod0:$clamp, omod0:$omod) ); } @@ -219,7 +219,7 @@ defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", DIV_FIXUP_F32_PROF, AMDGPUdi let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in { defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>; - defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp>; + defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, any_fldexp>; } // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1 } // End isReMaterializable = 1 @@ -263,7 +263,7 @@ let SchedRW = [Write64Bit] in { def : GCNPat< (i32 (DivergentUnaryFrag<sext> i16:$src)), - (i32 (V_BFE_I32_e64 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10)))) + (i32 (V_BFE_I32_e64 i16:$src, (i32 0), (i32 0x10))) >; let isReMaterializable = 1 in { @@ -308,11 +308,11 @@ let FPDPRounding = 1 in { defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma>; } // End Predicates = [Has16BitInsts, isGFX8Only] - let renamedInGFX9 = 1, Predicates = [Has16BitInsts, isGFX9Plus] in { + let renamedInGFX9 = 1, SubtargetPredicate = isGFX9Plus in { defm V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup>; defm V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, any_fma>; - } // End renamedInGFX9 = 1, Predicates = [Has16BitInsts, isGFX9Plus] + } // End renamedInGFX9 = 1, SubtargetPredicate = isGFX9Plus } // End FPDPRounding = 1 let SubtargetPredicate = Has16BitInsts, isCommutable = 1 
in { @@ -381,36 +381,43 @@ def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>; def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>; } // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus] -let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in { +// Note: 16-bit instructions produce a 0 result in the high 16-bits +// on GFX8 and GFX9 and preserve high 16 bits on GFX10+ +multiclass Arithmetic_i16_0Hi_TernaryPats <SDPatternOperator op, Instruction inst> { + def : GCNPat< + (i32 (zext (op i16:$src0, i16:$src1, i16:$src2))), + (inst VSrc_b16:$src0, VSrc_b16:$src1, VSrc_b16:$src2) + >; +} -multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2, - Instruction inst> { -def : GCNPat < - (op2 (op1 i16:$src0, i16:$src1), i16:$src2), - (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)) ->; +let Predicates = [Has16BitInsts, isGFX8GFX9] in { +defm : Arithmetic_i16_0Hi_TernaryPats<imad, V_MAD_U16_e64>; +} + +let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in { +// FIXME: Should be able to just pass imad to the instruction +// definition pattern, but the implied clamp input interferes. 
+multiclass Ternary_i16_Pats <SDPatternOperator op, Instruction inst> { + def : GCNPat < + (op i16:$src0, i16:$src1, i16:$src2), + (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)) + >; } -defm: Ternary_i16_Pats<mul, add, V_MAD_U16_e64>; -defm: Ternary_i16_Pats<mul, add, V_MAD_I16_e64>; +defm: Ternary_i16_Pats<imad, V_MAD_U16_e64>; } // End Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] -let Predicates = [Has16BitInsts, isGFX10Plus] in { -multiclass Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2, - Instruction inst> { -def : GCNPat < +class Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2, + Instruction inst> : GCNPat < (op2 (op1 i16:$src0, i16:$src1), i16:$src2), (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE) >; -} - -defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>; -defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9_e64>; - +let Predicates = [Has16BitInsts, isGFX10Plus] in { +def: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>; } // End Predicates = [Has16BitInsts, isGFX10Plus] class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag< @@ -673,11 +680,19 @@ def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3 let HasExtDPP = 0; } +def opsel_i1timm : SDNodeXForm<timm, [{ + return CurDAG->getTargetConstant( + N->getZExtValue() ? 
SISrcMods::OP_SEL_0 : SISrcMods::NONE, + SDLoc(N), MVT::i32); +}]>; +def gi_opsel_i1timm : GICustomOperandRenderer<"renderOpSelTImm">, + GISDNodeXFormEquiv<opsel_i1timm>; + class PermlanePat<SDPatternOperator permlane, Instruction inst> : GCNPat< (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc), - (inst (as_i1timm $fi), VGPR_32:$src0, (as_i1timm $bc), + (inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc), SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in) >; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index da12515c817b..71e09611e74e 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -35,7 +35,7 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR, FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); dag dpp_srcs = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, - FPVRegInputMods:$src1_modifiers, VGPRSrc_32:$src1, + FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1, FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); // FIXME: clampmod0 misbehaves with the non-default vdst_in @@ -142,9 +142,34 @@ def : VOP3PSatPat<usubsat, V_PK_SUB_U16>; def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>; } // End SubtargetPredicate = HasVOP3PInsts +// TODO: Make sure we're doing the right thing with denormals. Note +// that FMA and MAD will differ. multiclass MadFmaMixPats<SDPatternOperator fma_like, + Instruction mix_inst, Instruction mixlo_inst, Instruction mixhi_inst> { + // At least one of the operands needs to be an fpextend of an f16 + // for this to be worthwhile, so we need three patterns here. + // TODO: Could we use a predicate to inspect src1/2/3 instead? 
+ def : GCNPat < + (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_mods)))), + (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, + DSTCLAMP.NONE)>; + def : GCNPat < + (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsExt f16:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixMods f32:$src2, i32:$src2_mods)))), + (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, + DSTCLAMP.NONE)>; + def : GCNPat < + (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsExt f16:$src2, i32:$src2_mods)))), + (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, + DSTCLAMP.NONE)>; + def : GCNPat < (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), @@ -201,9 +226,29 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like, DSTCLAMP.ENABLE, (i32 (IMPLICIT_DEF))))) >; + + def : GCNPat < + (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))), + (mixlo_inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + (i32 0), (i32 0), + DSTCLAMP.NONE, + (i32 (IMPLICIT_DEF))) + >; + + def : GCNPat < + (build_vector f16:$elt0, (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))), + (v2f16 (mixhi_inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + (i32 0), (i32 0), + DSTCLAMP.NONE, + VGPR_32:$elt0)) + >; } -let SubtargetPredicate = HasMadMixInsts in { +let SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals] in { // These are VOP3a-like opcodes which accept no omod. // Size of src arguments (16/32) is controlled by op_sel. 
@@ -222,8 +267,8 @@ defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F } // End FPDPRounding = 1 } -defm : MadFmaMixPats<fmad, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>; -} // End SubtargetPredicate = HasMadMixInsts +defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>; +} // End SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals] // Essentially the same as the mad_mix versions @@ -243,7 +288,7 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F } // End FPDPRounding = 1 } -defm : MadFmaMixPats<fma, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>; +defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>; } // Defines patterns that extract signed 4bit from each Idx[0]. @@ -337,11 +382,12 @@ defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", } // End SubtargetPredicate = HasDot2Insts -let SubtargetPredicate = HasDot7Insts in { - +let SubtargetPredicate = HasDot10Insts in defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR, /*HasDPP*/ 1>, AMDGPUfdot2, 1/*ExplicitClamp*/>; + +let SubtargetPredicate = HasDot7Insts in { defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>; defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", @@ -581,7 +627,7 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node, MFMATable<0, NAME # "_vgprcd_e64">; } - foreach _ = BoolToList<NoDstOverlap>.ret in { + if NoDstOverlap then { let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""), isConvertibleToThreeAddress = NoDstOverlap, Mnemonic = OpName in { @@ -989,7 +1035,7 @@ multiclass VOP3P_Real_MFMA_gfx940_aliases<string NameFrom, string NameTo, string VOPProfile Pfl_ACD = PS_ACD.Pfl, VOPProfile Pfl_VCD = PS_VCD.Pfl> { let Predicates = [isGFX940Plus] in { - foreach _ = BoolToList<!ne(NameFrom, NameTo)>.ret in { + if !ne(NameFrom, NameTo) then { def : InstAlias <NameTo # " 
" # PS_ACD.AsmOperands, (!cast<VOP3P_Real>(Op # "_gfx940_acd") Pfl_ACD.DstRC:$vdst, Pfl_ACD.Src0RC64:$src0, Pfl_ACD.Src1RC64:$src1, Pfl_ACD.Src2RC64:$src2, @@ -1017,7 +1063,7 @@ multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(N defm : VOP3P_Real_MFMA_gfx940_aliases<Name, PS_ACD.Mnemonic, NAME>; - foreach _ = BoolToList<!ne(!subst("_1k", "", PS_ACD.Mnemonic), PS_ACD.Mnemonic)>.ret in + if !ne(!subst("_1k", "", PS_ACD.Mnemonic), PS_ACD.Mnemonic) then defm : VOP3P_Real_MFMA_gfx940_aliases<Name, !subst("_1k", "", PS_ACD.Mnemonic), NAME>; } @@ -1081,28 +1127,16 @@ defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x22>; } -let SubtargetPredicate = HasDot2Insts in { - defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x26>; defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x27>; -} // End SubtargetPredicate = HasDot2Insts - -let SubtargetPredicate = HasDot7Insts in { - defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>; defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x29>; defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x2b>; -} // End SubtargetPredicate = HasDot7Insts - -let SubtargetPredicate = HasDot1Insts in { - defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x28>; defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x2a>; -} // End SubtargetPredicate = HasDot1Insts - let SubtargetPredicate = HasMAIInsts in { defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x58>; @@ -1225,24 +1259,12 @@ defm V_FMA_MIX_F32 : VOP3P_Real_gfx10_gfx11_Triple <0x20>; defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x21>; defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x22>; -let SubtargetPredicate = HasDot2Insts in { - defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x14>; defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>; -} // End SubtargetPredicate = HasDot2Insts - -let SubtargetPredicate = HasDot7Insts in { - defm V_DOT2_F32_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x13>; defm V_DOT4_U32_U8 : VOP3P_Real_gfx10_gfx11 <0x17>; defm V_DOT8_U32_U4 : VOP3P_Real_gfx10_gfx11 <0x19>; -} // End SubtargetPredicate = HasDot7Insts - -let SubtargetPredicate = 
HasDot1Insts in { - defm V_DOT4_I32_I8 : VOP3P_Real_gfx10 <0x16>; defm V_DOT8_I32_I4 : VOP3P_Real_gfx10 <0x18>; - -} // End SubtargetPredicate = HasDot1Insts diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 439ca40ae3fb..6fc3d0957dce 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -299,7 +299,7 @@ multiclass VOPC_Pseudos <string opName, let isCommutable = 1; } - foreach _ = BoolToList<P.HasExtSDWA>.ret in + if P.HasExtSDWA then def _sdwa : VOPC_SDWA_Pseudo <opName, P> { let Defs = !if(DefExec, [EXEC], []); let SchedRW = P.Schedule; @@ -360,7 +360,7 @@ multiclass VOPCX_Pseudos <string opName, let IsVCMPX = 1; } - foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in + if P_NoSDst.HasExtSDWA then def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> { let Defs = [EXEC]; let SchedRW = P_NoSDst.Schedule; @@ -770,7 +770,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType src0VT, ValueType // DPP8 forbids modifiers and can inherit from VOPC_Profile let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VGPRSrc_32:$src1); + dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VRegSrc_32:$src1); let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel), (ins))); let AsmVOP3Base = "$sdst, $src0_modifiers, $src1"; @@ -831,7 +831,7 @@ class getVOPCClassPat64 <VOPProfile P> { list<dag> ret = [(set i1:$sdst, (AMDGPUfp_class - (P.Src0VT (VOP3Mods P.Src0VT:$src0, i32:$src0_modifiers)), + (P.Src0VT (VOP3ModsNonCanonicalizing P.Src0VT:$src0, i32:$src0_modifiers)), i32:$src1))]; } @@ -854,7 +854,7 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec, let SchedRW = p.Schedule; } - foreach _ = BoolToList<p.HasExtSDWA>.ret in + if p.HasExtSDWA then def _sdwa : VOPC_SDWA_Pseudo <opName, 
p> { let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]), !if(DefVcc, [VCC], [])); @@ -902,7 +902,7 @@ multiclass VOPCX_Class_Pseudos <string opName, let SubtargetPredicate = HasNoSdstCMPX; } - foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in + if P_NoSDst.HasExtSDWA then def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> { let Defs = [EXEC]; let SchedRW = P_NoSDst.Schedule; @@ -992,11 +992,18 @@ multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> { (i64 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_64)) >; - let WaveSizePredicate = isWave32 in - def : GCNPat < - (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), - (i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32)) - >; + let WaveSizePredicate = isWave32 in { + def : GCNPat < + (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), + (i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32)) + >; + + // Support codegen of i64 setcc in wave32 mode. + def : GCNPat < + (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)), + (i64 (REG_SEQUENCE SReg_64, (inst $src0, $src1), sub0, (S_MOV_B32 (i32 0)), sub1)) + >; + } } defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>; @@ -1056,13 +1063,22 @@ multiclass FCMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> { DSTCLAMP.NONE), SReg_64)) >; - let WaveSizePredicate = isWave32 in - def : GCNPat < - (i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), - (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)), - (i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1, - DSTCLAMP.NONE), SReg_32)) - >; + let WaveSizePredicate = isWave32 in { + def : GCNPat < + (i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), + (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)), + (i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1, + DSTCLAMP.NONE), SReg_32)) + >; + + def : GCNPat < + (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), + (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), 
cond)), + (i64 (REG_SEQUENCE SReg_64, (inst $src0_modifiers, $src0, $src1_modifiers, $src1, + DSTCLAMP.NONE), sub0, + (S_MOV_B32 (i32 0)), sub1)) + >; + } } defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>; @@ -1320,7 +1336,7 @@ let AssemblerPredicate = isGFX11Only in { defm : VOPCInstAliases<NAME, "gfx11">; - foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in { + if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; let DecoderNamespace = "DPPGFX11" in { @@ -1352,7 +1368,7 @@ let AssemblerPredicate = isGFX11Only in { } } } - foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in { + if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; let DecoderNamespace = "DPPGFX11" in { @@ -1419,7 +1435,7 @@ let AssemblerPredicate = isGFX11Only in { defm : VOPCInstAliases<OpName, "gfx11", NAME, asm_name>; - foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in { + if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; let DecoderNamespace = "DPPGFX11" in { @@ -1456,7 +1472,7 @@ let AssemblerPredicate = isGFX11Only in { } } - foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in { + if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; let DecoderNamespace = "DPPGFX11" in { @@ -1518,7 +1534,7 @@ let AssemblerPredicate = isGFX11Only in { defm : VOPCXInstAliases<NAME, "gfx11">; - foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in { + if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e32" #"_dpp"); defvar AsmDPP = ps32.Pfl.AsmDPP16; let DecoderNamespace = "DPPGFX11" in { @@ -1535,7 +1551,7 @@ let AssemblerPredicate = isGFX11Only in { } } - foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in { + if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = 
!cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e64" #"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; let DecoderNamespace = "DPPGFX11" in { @@ -1584,7 +1600,7 @@ let AssemblerPredicate = isGFX11Only in { defm : VOPCXInstAliases<OpName, "gfx11", NAME, asm_name>; - foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in { + if ps32.Pfl.HasExtDPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e32"#"_dpp"); let DecoderNamespace = "DPPGFX11" in { def _e32_dpp_gfx11 : VOPC_DPP16_SIMC<op{7-0}, psDPP, @@ -1594,7 +1610,7 @@ let AssemblerPredicate = isGFX11Only in { def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32, asm_name>; } } - foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in { + if ps64.Pfl.HasExtVOP3DPP then { defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e64"#"_dpp"); defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16; let DecoderNamespace = "DPPGFX11" in { @@ -1821,7 +1837,7 @@ let AssemblerPredicate = isGFX10Only in { } } // End DecoderNamespace = "GFX10" - foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in + if !cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx10 : VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>, VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; @@ -1847,7 +1863,7 @@ let AssemblerPredicate = isGFX10Only in { } } // End DecoderNamespace = "GFX10" - foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9>.ret in + if !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx10 : VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa")>, VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa").Pfl> { @@ -2174,12 +2190,12 @@ multiclass VOPC_Real_vi <bits<10> op> { } } - foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in + if !cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA then def _sdwa_vi : VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>, VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; - foreach 
_ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in + if !cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then def _sdwa_gfx9 : VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>, VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index d5c662ac0574..3755daf4f9b1 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -1268,7 +1268,7 @@ class VOP3InstBase<string OpName, VOPProfile P, SDPatternOperator node = null_fr multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> { def _e64 : VOP3InstBase<OpName, P, node>; let SubtargetPredicate = isGFX11Plus in { - foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in + if P.HasExtVOP3DPP then def _e64_dpp : VOP3_DPP_Pseudo <OpName, P>; } // end SubtargetPredicate = isGFX11Plus } @@ -1329,11 +1329,11 @@ let AssemblerPredicate = isGFX11Only, bit isSingle = 0> { defvar ps = !cast<VOP_Pseudo>(opName#"_e64"); let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { - foreach _ = BoolToList<ps.Pfl.HasOpSel>.ret in + if ps.Pfl.HasOpSel then def _e64_gfx11 : VOP3_Real<ps, SIEncodingFamily.GFX11>, VOP3OpSel_gfx11<op, ps.Pfl>; - foreach _ = BoolToList<!not(ps.Pfl.HasOpSel)>.ret in + if !not(ps.Pfl.HasOpSel) then def _e64_gfx11 : VOP3_Real<ps, SIEncodingFamily.GFX11>, VOP3e_gfx11<op, ps.Pfl>; @@ -1353,11 +1353,11 @@ let AssemblerPredicate = isGFX11Only, defvar ps = !cast<VOP_Pseudo>(opName#"_e64"); let AsmString = asmName # ps.AsmOperands, IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { - foreach _ = BoolToList<ps.Pfl.HasOpSel>.ret in + if ps.Pfl.HasOpSel then def _e64_gfx11 : VOP3_Real<ps, SIEncodingFamily.GFX11>, VOP3OpSel_gfx11<op, ps.Pfl>; - foreach _ = BoolToList<!not(ps.Pfl.HasOpSel)>.ret in + if !not(ps.Pfl.HasOpSel) then def _e64_gfx11 : VOP3_Real<ps, SIEncodingFamily.GFX11>, VOP3e_gfx11<op, ps.Pfl>; @@ -1487,7 +1487,7 @@ 
include "VOP3PInstructions.td" include "VOPDInstructions.td" class ClassPat<Instruction inst, ValueType vt> : GCNPat < - (is_fpclass (vt (VOP3Mods vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), + (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)), (inst i32:$src0_mods, vt:$src0, (V_MOV_B32_e32 timm:$mask)) >; |
