| author | Dimitry Andric <dim@FreeBSD.org> | 2018-07-28 10:51:19 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2018-07-28 10:51:19 +0000 |
| commit | eb11fae6d08f479c0799db45860a98af528fa6e7 (patch) | |
| tree | 44d492a50c8c1a7eb8e2d17ea3360ec4d066f042 /lib/Target/AMDGPU | |
| parent | b8a2042aa938069e862750553db0e4d82d25822c (diff) | |
Diffstat (limited to 'lib/Target/AMDGPU')
183 files changed, 15348 insertions, 7244 deletions
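
The hunks that follow repeatedly touch the AMDGPU address-space numbering: `CONSTANT_ADDRESS` moves from 2 to 4, `REGION_ADDRESS` becomes 2, a new `CONSTANT_ADDRESS_32BIT` space (6) is introduced, and the alias analysis now treats both constant address spaces as pointing to constant memory. Below is a minimal standalone C++ sketch of that numbering and the constant-memory check, using only the values visible in the diff; the `sketch` namespace and the `isConstantAddressSpace` helper are illustrative names, not the in-tree API.

```cpp
// Standalone sketch of the post-change AMDGPU address-space numbering
// (see the AMDGPU.h and AMDGPUAliasAnalysis.cpp hunks below).
#include <cstdio>

namespace sketch {
enum AMDGPUAddrSpace : unsigned {
  FLAT_ADDRESS = 0,           // flat
  GLOBAL_ADDRESS = 1,         // global memory (RAT0, VTX0)
  REGION_ADDRESS = 2,         // region/GDS (was 4 before this change)
  LOCAL_ADDRESS = 3,          // local (LDS) memory
  CONSTANT_ADDRESS = 4,       // constant memory (VTX2); was 2 before this change
  PRIVATE_ADDRESS = 5,        // private (scratch) memory
  CONSTANT_ADDRESS_32BIT = 6  // new: 32-bit constant memory
};

// Mirrors the updated pointsToConstantMemory logic: both constant address
// spaces are treated as read-only constant memory.
inline bool isConstantAddressSpace(unsigned AS) {
  return AS == CONSTANT_ADDRESS || AS == CONSTANT_ADDRESS_32BIT;
}
} // namespace sketch

int main() {
  std::printf("AS 4 constant? %d\n", sketch::isConstantAddressSpace(4)); // 1
  std::printf("AS 2 constant? %d\n", sketch::isConstantAddressSpace(2)); // 0: 2 is now region
  return 0;
}
```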
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 0ddc43ad5033..796766d94622 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -11,7 +11,6 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -50,9 +49,9 @@ FunctionPass *createSIOptimizeExecMaskingPreRAPass(); FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIDebuggerInsertNopsPass(); -FunctionPass *createSIInsertWaitsPass(); FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createSIFixWWMLivenessPass(); +FunctionPass *createSIFormMemoryClausesPass(); FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &); FunctionPass *createAMDGPUUseNativeCallsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(); @@ -74,6 +73,14 @@ ModulePass *createAMDGPULowerIntrinsicsPass(); void initializeAMDGPULowerIntrinsicsPass(PassRegistry &); extern char &AMDGPULowerIntrinsicsID; +FunctionPass *createAMDGPULowerKernelArgumentsPass(); +void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &); +extern char &AMDGPULowerKernelArgumentsID; + +ModulePass *createAMDGPULowerKernelAttributesPass(); +void initializeAMDGPULowerKernelAttributesPass(PassRegistry &); +extern char &AMDGPULowerKernelAttributesID; + void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &); extern char &AMDGPURewriteOutArgumentsID; @@ -134,6 +141,9 @@ extern char &AMDGPUSimplifyLibCallsID; void initializeAMDGPUUseNativeCallsPass(PassRegistry &); extern char &AMDGPUUseNativeCallsID; +void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &); +extern char &AMDGPUPerfHintAnalysisID; + // Passes common to R600 and SI FunctionPass *createAMDGPUPromoteAlloca(); void initializeAMDGPUPromoteAllocaPass(PassRegistry&); @@ -144,7 +154,7 @@ FunctionPass *createAMDGPUISelDag( TargetMachine *TM = nullptr, CodeGenOpt::Level OptLevel = CodeGenOpt::Default); ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true); -ModulePass *createAMDGPUOpenCLImageTypeLoweringPass(); +ModulePass *createR600OpenCLImageTypeLoweringPass(); FunctionPass *createAMDGPUAnnotateUniformValues(); ModulePass* createAMDGPUUnifyMetadataPass(); @@ -169,12 +179,12 @@ extern char &SIMemoryLegalizerID; void initializeSIDebuggerInsertNopsPass(PassRegistry&); extern char &SIDebuggerInsertNopsID; -void initializeSIInsertWaitsPass(PassRegistry&); -extern char &SIInsertWaitsID; - void initializeSIInsertWaitcntsPass(PassRegistry&); extern char &SIInsertWaitcntsID; +void initializeSIFormMemoryClausesPass(PassRegistry&); +extern char &SIFormMemoryClausesID; + void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&); extern char &AMDGPUUnifyDivergentExitNodesID; @@ -222,8 +232,11 @@ struct AMDGPUAS { MAX_COMMON_ADDRESS = 5, GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2) + CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2) LOCAL_ADDRESS = 3, ///< Address space for local memory. 
+ + CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory + /// Address space for direct addressible parameter memory (CONST0) PARAM_D_ADDRESS = 6, /// Address space for indirect addressible parameter memory (VTX1) diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index c02d0a131041..16c2a366db28 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -7,58 +7,30 @@ // //===------------------------------------------------------------===// +include "llvm/TableGen/SearchableTable.td" include "llvm/Target/Target.td" +include "AMDGPUFeatures.td" //===------------------------------------------------------------===// // Subtarget Features (device properties) //===------------------------------------------------------------===// -def FeatureFP64 : SubtargetFeature<"fp64", - "FP64", - "true", - "Enable double precision operations" ->; - -def FeatureFMA : SubtargetFeature<"fmaf", - "FMA", - "true", - "Enable single precision FMA (not as fast as mul+add, but fused)" ->; - def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", "FastFMAF32", "true", "Assuming f32 fma is at least as fast as mul + add" >; -def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops", - "HalfRate64Ops", - "true", - "Most fp64 instructions are half rate instead of quarter" ->; - -def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", - "R600ALUInst", - "false", - "Older version of ALU instructions encoding" ->; - -def FeatureVertexCache : SubtargetFeature<"HasVertexCache", - "HasVertexCache", +def FeatureMIMG_R128 : SubtargetFeature<"mimg-r128", + "MIMG_R128", "true", - "Specify use of dedicated vertex cache" + "Support 128-bit texture resources" >; -def FeatureCaymanISA : SubtargetFeature<"caymanISA", - "CaymanISA", - "true", - "Use Cayman ISA" ->; - -def FeatureCFALUBug : SubtargetFeature<"cfalubug", - "CFALUBug", +def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops", + "HalfRate64Ops", "true", - "GPU has CF_ALU bug" + "Most fp64 instructions are half rate instead of quarter" >; def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", @@ -121,6 +93,12 @@ def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts", "Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions" >; +def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts", + "HasFmaMixInsts", + "true", + "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions" +>; + // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support // XNACK. 
The current default kernel driver setting is: // - graphics ring: XNACK disabled @@ -140,27 +118,6 @@ def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "VI SGPR initialization bug requiring a fixed SGPR allocation size" >; -class SubtargetFeatureFetchLimit <string Value> : - SubtargetFeature <"fetch"#Value, - "TexVTXClauseSize", - Value, - "Limit the maximum number of fetches in a clause to "#Value ->; - -def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; -def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; - -class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature< - "wavefrontsize"#Value, - "WavefrontSize", - !cast<string>(Value), - "The number of threads per wavefront" ->; - -def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; -def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; -def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; - class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < "ldsbankcount"#Value, "LDSBankCount", @@ -171,19 +128,6 @@ class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; -class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature< - "localmemorysize"#Value, - "LocalMemorySize", - !cast<string>(Value), - "The size of local memory in bytes" ->; - -def FeatureGCN : SubtargetFeature<"gcn", - "IsGCN", - "true", - "GCN or newer GPU" ->; - def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", "GCN3Encoding", "true", @@ -244,6 +188,12 @@ def FeatureScalarStores : SubtargetFeature<"scalar-stores", "Has store scalar memory instructions" >; +def FeatureScalarAtomics : SubtargetFeature<"scalar-atomics", + "HasScalarAtomics", + "true", + "Has atomic scalar memory instructions" +>; + def FeatureSDWA : SubtargetFeature<"sdwa", "HasSDWA", "true", @@ -292,6 +242,27 @@ def FeatureIntClamp : SubtargetFeature<"int-clamp-insts", "Support clamp for integer destination" >; +def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem", + "HasUnpackedD16VMem", + "true", + "Has unpacked d16 vmem instructions" +>; + +def FeatureDLInsts : SubtargetFeature<"dl-insts", + "HasDLInsts", + "true", + "Has deep learning instructions" +>; + +def FeatureD16PreservesUnusedBits : SubtargetFeature< + "d16-preserves-unused-bits", + "D16PreservesUnusedBits", + "true", + "If present, then instructions defined by HasD16LoadStore predicate preserve " + "unused bits. Otherwise instructions defined by HasD16LoadStore predicate " + "zero unused bits." 
+>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -329,12 +300,6 @@ def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals", [FeatureFP64FP16Denormals] >; -def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp", - "DX10Clamp", - "true", - "clamp modifier clamps NaNs to 0.0" ->; - def FeatureFPExceptions : SubtargetFeature<"fp-exceptions", "FPExceptions", "true", @@ -377,12 +342,6 @@ def FeatureDumpCodeLower : SubtargetFeature <"dumpcode", "Dump MachineInstrs in the CodeEmitter" >; -def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", - "EnablePromoteAlloca", - "true", - "Enable promote alloca pass" ->; - // XXX - This should probably be removed once enabled by default def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", "EnableLoadStoreOpt", @@ -408,6 +367,12 @@ def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler", "Enable SI Machine Scheduler" >; +def FeatureEnableDS128 : SubtargetFeature<"enable-ds128", + "EnableDS128", + "true", + "Use ds_{read|write}_b128" +>; + // Unless +-flat-for-global is specified, turn on FlatForGlobal for // all OS-es on VI and newer hardware to avoid assertion failures due // to missing ADDR64 variants of MUBUF instructions. @@ -440,46 +405,30 @@ def FeatureDisable : SubtargetFeature<"", "Dummy feature to disable assembler instructions" >; -class SubtargetFeatureGeneration <string Value, - list<SubtargetFeature> Implies> : - SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value, - Value#" GPU generation", Implies>; - -def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; -def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; -def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; - -def FeatureR600 : SubtargetFeatureGeneration<"R600", - [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0] ->; - -def FeatureR700 : SubtargetFeatureGeneration<"R700", - [FeatureFetchLimit16, FeatureLocalMemorySize0] ->; - -def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", - [FeatureFetchLimit16, FeatureLocalMemorySize32768] +def FeatureGCN : SubtargetFeature<"gcn", + "IsGCN", + "true", + "GCN or newer GPU" >; -def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", - [FeatureFetchLimit16, FeatureWavefrontSize64, - FeatureLocalMemorySize32768] ->; +class GCNSubtargetFeatureGeneration <string Value, + list<SubtargetFeature> Implies> : + SubtargetFeatureGeneration <Value, "GCNSubtarget", Implies>; -def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", - [FeatureFP64, FeatureLocalMemorySize32768, +def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", + [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureGCN, FeatureLDSBankCount32, FeatureMovrel] >; -def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", - [FeatureFP64, FeatureLocalMemorySize65536, +def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", + [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, FeatureCIInsts, FeatureMovrel] >; -def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", - [FeatureFP64, FeatureLocalMemorySize65536, +def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", + [FeatureFP64, 
FeatureLocalMemorySize65536, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, @@ -489,7 +438,7 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", ] >; -def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", +def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", [FeatureFP64, FeatureLocalMemorySize65536, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, @@ -498,7 +447,7 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, - FeatureAddNoCarryInsts + FeatureAddNoCarryInsts, FeatureScalarAtomics ] >; @@ -534,7 +483,8 @@ def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1, def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2, [FeatureSeaIslands, - FeatureLDSBankCount16]>; + FeatureLDSBankCount16, + FeatureFastFMAF32]>; def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3, [FeatureSeaIslands, @@ -544,26 +494,24 @@ def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4, [FeatureSeaIslands, FeatureLDSBankCount32]>; -def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0, - [FeatureVolcanicIslands, - FeatureLDSBankCount32, - FeatureSGPRInitBug]>; - def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, [FeatureVolcanicIslands, FeatureFastFMAF32, HalfRate64Ops, FeatureLDSBankCount32, - FeatureXNACK]>; + FeatureXNACK, + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2, [FeatureVolcanicIslands, FeatureLDSBankCount32, - FeatureSGPRInitBug]>; + FeatureSGPRInitBug, + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3, [FeatureVolcanicIslands, - FeatureLDSBankCount32]>; + FeatureLDSBankCount32, + FeatureUnpackedD16VMem]>; def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, [FeatureVolcanicIslands, @@ -573,14 +521,28 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0, [FeatureGFX9, FeatureMadMixInsts, - FeatureLDSBankCount32 - ]>; + FeatureLDSBankCount32, + FeatureD16PreservesUnusedBits]>; def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2, [FeatureGFX9, FeatureMadMixInsts, - FeatureLDSBankCount32 - ]>; + FeatureLDSBankCount32, + FeatureXNACK, + FeatureD16PreservesUnusedBits]>; + +def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4, + [FeatureGFX9, + FeatureLDSBankCount32, + FeatureFmaMixInsts, + FeatureD16PreservesUnusedBits]>; + +def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6, + [FeatureGFX9, + HalfRate64Ops, + FeatureFmaMixInsts, + FeatureLDSBankCount32, + FeatureDLInsts]>; //===----------------------------------------------------------------------===// // Debugger related subtarget features. 
@@ -593,13 +555,6 @@ def FeatureDebuggerInsertNops : SubtargetFeature< "Insert one nop instruction for each high level source statement" >; -def FeatureDebuggerReserveRegs : SubtargetFeature< - "amdgpu-debugger-reserve-regs", - "DebuggerReserveRegs", - "true", - "Reserve registers for debugger usage" ->; - def FeatureDebuggerEmitPrologue : SubtargetFeature< "amdgpu-debugger-emit-prologue", "DebuggerEmitPrologue", @@ -675,6 +630,7 @@ def AMDGPU : Target { SDWA9AsmParserVariant, DPPAsmParserVariant]; let AssemblyWriters = [AMDGPUAsmWriter]; + let AllowRegisterRenaming = 1; } // Dummy Instruction itineraries for pseudo instructions @@ -685,8 +641,6 @@ def NullALU : InstrItinClass; // Predicate helper class //===----------------------------------------------------------------------===// -def TruePredicate : Predicate<"true">; - def isSICI : Predicate< "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" @@ -715,6 +669,13 @@ def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">, def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, AssemblerPredicate<"FeatureGFX9Insts">; +def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">, + AssemblerPredicate<"FeatureUnpackedD16VMem">; +def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">, + AssemblerPredicate<"!FeatureUnpackedD16VMem">; + +def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">, + AssemblerPredicate<"FeatureD16PreservesUnusedBits">; def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">; def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">; @@ -733,6 +694,9 @@ def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<"FeatureVOP3P">; +def NotHasVOP3PInsts : Predicate<"!Subtarget->hasVOP3PInsts()">, + AssemblerPredicate<"!FeatureVOP3P">; + def HasSDWA : Predicate<"Subtarget->hasSDWA()">, AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">; @@ -748,38 +712,35 @@ def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">, def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">, AssemblerPredicate<"FeatureMadMixInsts">; -def EnableLateCFGStructurize : Predicate< - "EnableLateStructurizeCFG">; +def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">, + AssemblerPredicate<"FeatureScalarAtomics">; -// Exists to help track down where SubtargetPredicate isn't set rather -// than letting tablegen crash with an unhelpful error. 
-def InvalidPred : Predicate<"predicate not set on instruction or pattern">; - -class PredicateControl { - Predicate SubtargetPredicate = InvalidPred; - Predicate SIAssemblerPredicate = isSICI; - Predicate VIAssemblerPredicate = isVI; - list<Predicate> AssemblerPredicates = []; - Predicate AssemblerPredicate = TruePredicate; - list<Predicate> OtherPredicates = []; - list<Predicate> Predicates = !listconcat([SubtargetPredicate, - AssemblerPredicate], - AssemblerPredicates, - OtherPredicates); -} +def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; +def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; +def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, + AssemblerPredicate<"FeatureVGPRIndexMode">; +def HasMovrel : Predicate<"Subtarget->hasMovrel()">, + AssemblerPredicate<"FeatureMovrel">; + +def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">, + AssemblerPredicate<"FeatureFmaMixInsts">; -class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>, - PredicateControl; +def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">, + AssemblerPredicate<"FeatureDLInsts">; +def EnableLateCFGStructurize : Predicate< + "EnableLateStructurizeCFG">; + // Include AMDGPU TD files -include "R600Schedule.td" -include "R600Processors.td" include "SISchedule.td" include "GCNProcessors.td" include "AMDGPUInstrInfo.td" include "AMDGPUIntrinsics.td" +include "SIIntrinsics.td" include "AMDGPURegisterInfo.td" include "AMDGPURegisterBanks.td" include "AMDGPUInstructions.td" +include "SIInstrInfo.td" include "AMDGPUCallingConv.td" +include "AMDGPUSearchableTables.td" diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp index 392b011e387c..ef4b69d09d9f 100644 --- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp +++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -61,7 +61,7 @@ AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Ar /* Region */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias} }; static const AliasResult ASAliasRulesGenIsZero[6][6] = { - /* Flat Global Constant Group Region Private */ + /* Flat Global Region Group Constant Private */ /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias}, /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias}, /* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias}, @@ -72,9 +72,9 @@ AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Ar assert(AS.MAX_COMMON_ADDRESS <= 5); if (AS.FLAT_ADDRESS == 0) { assert(AS.GLOBAL_ADDRESS == 1 && - AS.REGION_ADDRESS == 4 && + AS.REGION_ADDRESS == 2 && AS.LOCAL_ADDRESS == 3 && - AS.CONSTANT_ADDRESS == 2 && + AS.CONSTANT_ADDRESS == 4 && AS.PRIVATE_ADDRESS == 5); ASAliasRules = &ASAliasRulesGenIsZero; } else { @@ -115,7 +115,8 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal) { const Value *Base = GetUnderlyingObject(Loc.Ptr, DL); - if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS) { + if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS || + Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS_32BIT) { return true; } diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index c27425443abc..d4bbb2c1eb8d 100644 --- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -14,6 +14,9 @@ 
//===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -30,13 +33,18 @@ static cl::opt<bool> StressCalls( class AMDGPUAlwaysInline : public ModulePass { bool GlobalOpt; + void recursivelyVisitUsers(GlobalValue &GV, + SmallPtrSetImpl<Function *> &FuncsToAlwaysInline); public: static char ID; AMDGPUAlwaysInline(bool GlobalOpt = false) : ModulePass(ID), GlobalOpt(GlobalOpt) { } bool runOnModule(Module &M) override; - StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } }; } // End anonymous namespace @@ -46,15 +54,53 @@ INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline", char AMDGPUAlwaysInline::ID = 0; +void AMDGPUAlwaysInline::recursivelyVisitUsers( + GlobalValue &GV, + SmallPtrSetImpl<Function *> &FuncsToAlwaysInline) { + SmallVector<User *, 16> Stack; + + SmallPtrSet<const Value *, 8> Visited; + + for (User *U : GV.users()) + Stack.push_back(U); + + while (!Stack.empty()) { + User *U = Stack.pop_back_val(); + if (!Visited.insert(U).second) + continue; + + if (Instruction *I = dyn_cast<Instruction>(U)) { + Function *F = I->getParent()->getParent(); + if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) { + FuncsToAlwaysInline.insert(F); + Stack.push_back(F); + } + + // No need to look at further users, but we do need to inline any callers. + continue; + } + + for (User *UU : U->users()) + Stack.push_back(UU); + } +} + bool AMDGPUAlwaysInline::runOnModule(Module &M) { + AMDGPUAS AMDGPUAS = AMDGPU::getAMDGPUAS(M); + std::vector<GlobalAlias*> AliasesToRemove; - std::vector<Function *> FuncsToClone; + + SmallPtrSet<Function *, 8> FuncsToAlwaysInline; + SmallPtrSet<Function *, 8> FuncsToNoInline; for (GlobalAlias &A : M.aliases()) { if (Function* F = dyn_cast<Function>(A.getAliasee())) { A.replaceAllUsesWith(F); AliasesToRemove.push_back(&A); } + + // FIXME: If the aliasee isn't a function, it's some kind of constant expr + // cast that won't be inlined through. } if (GlobalOpt) { @@ -63,31 +109,51 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) { } } - auto NewAttr = StressCalls ? Attribute::NoInline : Attribute::AlwaysInline; - auto IncompatAttr - = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline; - - for (Function &F : M) { - if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() && - !F.hasFnAttribute(IncompatAttr)) - FuncsToClone.push_back(&F); - } - - for (Function *F : FuncsToClone) { - ValueToValueMapTy VMap; - Function *NewFunc = CloneFunction(F, VMap); - NewFunc->setLinkage(GlobalValue::InternalLinkage); - F->replaceAllUsesWith(NewFunc); + // Always force inlining of any function that uses an LDS global address. This + // is something of a workaround because we don't have a way of supporting LDS + // objects defined in functions. LDS is always allocated by a kernel, and it + // is difficult to manage LDS usage if a function may be used by multiple + // kernels. + // + // OpenCL doesn't allow declaring LDS in non-kernels, so in practice this + // should only appear when IPO passes manages to move LDs defined in a kernel + // into a single user function. 
+ + for (GlobalVariable &GV : M.globals()) { + // TODO: Region address + unsigned AS = GV.getType()->getAddressSpace(); + if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS.REGION_ADDRESS) + continue; + + recursivelyVisitUsers(GV, FuncsToAlwaysInline); } - for (Function &F : M) { - if (F.hasLocalLinkage() && !F.hasFnAttribute(IncompatAttr)) { - F.addFnAttr(NewAttr); + if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) { + auto IncompatAttr + = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline; + + for (Function &F : M) { + if (!F.isDeclaration() && !F.use_empty() && + !F.hasFnAttribute(IncompatAttr)) { + if (StressCalls) { + if (!FuncsToAlwaysInline.count(&F)) + FuncsToNoInline.insert(&F); + } else + FuncsToAlwaysInline.insert(&F); + } } } - return false; + + for (Function *F : FuncsToAlwaysInline) + F->addFnAttr(Attribute::AlwaysInline); + + for (Function *F : FuncsToNoInline) + F->addFnAttr(Attribute::NoInline); + + return !FuncsToAlwaysInline.empty() || !FuncsToNoInline.empty(); } ModulePass *llvm::createAMDGPUAlwaysInlinePass(bool GlobalOpt) { return new AMDGPUAlwaysInline(GlobalOpt); } + diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index ce17202f3414..1a70833a4472 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -219,7 +219,7 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee, } bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); bool HasFlat = ST.hasFlatAddressSpace(); bool HasApertureRegs = ST.hasApertureRegs(); SmallPtrSet<const Constant *, 8> ConstantExprVisited; diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index dcca3a2fab96..7465cf22b5a4 100644 --- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -55,9 +55,6 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { << " DispatchID: " << FI.second.DispatchID << " FlatScratchInit: " << FI.second.FlatScratchInit << " PrivateSegmentSize: " << FI.second.PrivateSegmentSize - << " GridWorkgroupCountX: " << FI.second.GridWorkGroupCountX - << " GridWorkgroupCountY: " << FI.second.GridWorkGroupCountY - << " GridWorkgroupCountZ: " << FI.second.GridWorkGroupCountZ << " WorkGroupIDX: " << FI.second.WorkGroupIDX << " WorkGroupIDY: " << FI.second.WorkGroupIDY << " WorkGroupIDZ: " << FI.second.WorkGroupIDZ diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index bf9635549a8c..f0e6d1b83f15 100644 --- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -18,7 +18,7 @@ namespace llvm { class Function; class raw_ostream; -class SISubtarget; +class GCNSubtarget; class TargetMachine; class TargetRegisterClass; class TargetRegisterInfo; @@ -111,9 +111,6 @@ struct AMDGPUFunctionArgInfo { ArgDescriptor DispatchID; ArgDescriptor FlatScratchInit; ArgDescriptor PrivateSegmentSize; - ArgDescriptor GridWorkGroupCountX; - ArgDescriptor GridWorkGroupCountY; - ArgDescriptor GridWorkGroupCountZ; // System SGPRs in kernels. 
ArgDescriptor WorkGroupIDX; diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index fda6252f46e3..e62e5d52ad74 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===// +//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer -------------------===// // // The LLVM Compiler Infrastructure // @@ -21,7 +21,9 @@ #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "InstPrinter/AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" +#include "R600AsmPrinter.h" #include "R600Defines.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" @@ -32,7 +34,6 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" @@ -40,6 +41,7 @@ #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; using namespace llvm::AMDGPU; @@ -65,7 +67,7 @@ using namespace llvm::AMDGPU; // instructions to run at the double precision rate for the device so it's // probably best to just report no single precision denormals. static uint32_t getFPMode(const MachineFunction &F) { - const SISubtarget& ST = F.getSubtarget<SISubtarget>(); + const GCNSubtarget& ST = F.getSubtarget<GCNSubtarget>(); // TODO: Is there any real use for the flush in only / flush out only modes? uint32_t FP32Denormals = @@ -88,7 +90,7 @@ createAMDGPUAsmPrinterPass(TargetMachine &tm, extern "C" void LLVMInitializeAMDGPUAsmPrinter() { TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(), - createAMDGPUAsmPrinterPass); + llvm::createR600AsmPrinterPass); TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(), createAMDGPUAsmPrinterPass); } @@ -114,7 +116,8 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { } void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { - if (TM.getTargetTriple().getArch() != Triple::amdgcn) + if (IsaInfo::hasCodeObjectV3(getSTI()) && + TM.getTargetTriple().getOS() == Triple::AMDHSA) return; if (TM.getTargetTriple().getOS() != Triple::AMDHSA && @@ -127,10 +130,6 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { if (TM.getTargetTriple().getOS() == Triple::AMDPAL) readPALMetadata(M); - // Deprecated notes are not emitted for code object v3. - if (IsaInfo::hasCodeObjectV3(getSTI()->getFeatureBits())) - return; - // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2. if (TM.getTargetTriple().getOS() == Triple::AMDHSA) getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1); @@ -142,7 +141,9 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { } void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { - if (TM.getTargetTriple().getArch() != Triple::amdgcn) + // TODO: Add metadata to code object v3. + if (IsaInfo::hasCodeObjectV3(getSTI()) && + TM.getTargetTriple().getOS() == Triple::AMDHSA) return; // Following code requires TargetStreamer to be present. 
@@ -189,37 +190,82 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( } void AMDGPUAsmPrinter::EmitFunctionBodyStart() { - const AMDGPUMachineFunction *MFI = MF->getInfo<AMDGPUMachineFunction>(); - if (!MFI->isEntryFunction()) + const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); + if (!MFI.isEntryFunction()) + return; + if (IsaInfo::hasCodeObjectV3(getSTI()) && + TM.getTargetTriple().getOS() == Triple::AMDHSA) return; - const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); - amd_kernel_code_t KernelCode; - if (STM.isAmdCodeObjectV2(*MF)) { + const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); + const Function &F = MF->getFunction(); + if (STM.isAmdCodeObjectV2(F) && + (F.getCallingConv() == CallingConv::AMDGPU_KERNEL || + F.getCallingConv() == CallingConv::SPIR_KERNEL)) { + amd_kernel_code_t KernelCode; getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF); - - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); getTargetStreamer()->EmitAMDKernelCodeT(KernelCode); } if (TM.getTargetTriple().getOS() != Triple::AMDHSA) return; - HSAMetadataStream.emitKernel(MF->getFunction(), - getHSACodeProps(*MF, CurrentProgramInfo), - getHSADebugProps(*MF, CurrentProgramInfo)); + HSAMetadataStream.emitKernel(*MF, CurrentProgramInfo); +} + +void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { + const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); + if (!MFI.isEntryFunction()) + return; + if (!IsaInfo::hasCodeObjectV3(getSTI()) || + TM.getTargetTriple().getOS() != Triple::AMDHSA) + return; + + auto &Streamer = getTargetStreamer()->getStreamer(); + auto &Context = Streamer.getContext(); + auto &ObjectFileInfo = *Context.getObjectFileInfo(); + auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection(); + + Streamer.PushSection(); + Streamer.SwitchSection(&ReadOnlySection); + + // CP microcode requires the kernel descriptor to be allocated on 64 byte + // alignment. + Streamer.EmitValueToAlignment(64, 0, 1, 0); + if (ReadOnlySection.getAlignment() < 64) + ReadOnlySection.setAlignment(64); + + SmallString<128> KernelName; + getNameWithPrefix(KernelName, &MF->getFunction()); + getTargetStreamer()->EmitAmdhsaKernelDescriptor( + *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), + CurrentProgramInfo.NumVGPRsForWavesPerEU, + CurrentProgramInfo.NumSGPRsForWavesPerEU - + IsaInfo::getNumExtraSGPRs(getSTI()->getFeatureBits(), + CurrentProgramInfo.VCCUsed, + CurrentProgramInfo.FlatUsed), + CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, + hasXNACK(*getSTI())); + + Streamer.PopSection(); } void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { + if (IsaInfo::hasCodeObjectV3(getSTI()) && + TM.getTargetTriple().getOS() == Triple::AMDHSA) { + AsmPrinter::EmitFunctionEntryLabel(); + return; + } + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); - if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) { + const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); + if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(MF->getFunction())) { SmallString<128> SymbolName; getNameWithPrefix(SymbolName, &MF->getFunction()), getTargetStreamer()->EmitAMDGPUSymbolType( SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); } - const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>(); + const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>(); if (STI.dumpCode()) { // Disassemble function name label to text. 
DisasmLines.push_back(MF->getName().str() + ":"); @@ -231,7 +277,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { } void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { - const AMDGPUSubtarget &STI = MBB.getParent()->getSubtarget<AMDGPUSubtarget>(); + const GCNSubtarget &STI = MBB.getParent()->getSubtarget<GCNSubtarget>(); if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) { // Write a line for the basic block label if it is not only fallthrough. DisasmLines.push_back( @@ -283,11 +329,66 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments( uint32_t NumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, - uint64_t CodeSize) { + uint64_t CodeSize, + const AMDGPUMachineFunction *MFI) { OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false); OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false); + OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), + false); +} + +uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( + const MachineFunction &MF) const { + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + uint16_t KernelCodeProperties = 0; + + if (MFI.hasPrivateSegmentBuffer()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; + } + if (MFI.hasDispatchPtr()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + } + if (MFI.hasQueuePtr()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; + } + if (MFI.hasKernargSegmentPtr()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; + } + if (MFI.hasDispatchID()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; + } + if (MFI.hasFlatScratchInit()) { + KernelCodeProperties |= + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + } + + return KernelCodeProperties; +} + +amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor( + const MachineFunction &MF, + const SIProgramInfo &PI) const { + amdhsa::kernel_descriptor_t KernelDescriptor; + memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor)); + + assert(isUInt<32>(PI.ScratchSize)); + assert(isUInt<32>(PI.ComputePGMRSrc1)); + assert(isUInt<32>(PI.ComputePGMRSrc2)); + + KernelDescriptor.group_segment_fixed_size = PI.LDSSize; + KernelDescriptor.private_segment_fixed_size = PI.ScratchSize; + KernelDescriptor.compute_pgm_rsrc1 = PI.ComputePGMRSrc1; + KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2; + KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF); + + return KernelDescriptor; } bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { @@ -301,32 +402,29 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { SetupMachineFunction(MF); - const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); MCContext &Context = getObjFileLowering().getContext(); - if (!STM.isAmdHsaOS()) { + // FIXME: This should be an explicit check for Mesa. 
+ if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) { MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); OutStreamer->SwitchSection(ConfigSection); } - if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - if (MFI->isEntryFunction()) { - getSIProgramInfo(CurrentProgramInfo, MF); - } else { - auto I = CallGraphResourceInfo.insert( - std::make_pair(&MF.getFunction(), SIFunctionResourceInfo())); - SIFunctionResourceInfo &Info = I.first->second; - assert(I.second && "should only be called once per function"); - Info = analyzeResourceUsage(MF); - } - - if (STM.isAmdPalOS()) - EmitPALMetadata(MF, CurrentProgramInfo); - if (!STM.isAmdHsaOS()) { - EmitProgramInfoSI(MF, CurrentProgramInfo); - } + if (MFI->isEntryFunction()) { + getSIProgramInfo(CurrentProgramInfo, MF); } else { - EmitProgramInfoR600(MF); + auto I = CallGraphResourceInfo.insert( + std::make_pair(&MF.getFunction(), SIFunctionResourceInfo())); + SIFunctionResourceInfo &Info = I.first->second; + assert(I.second && "should only be called once per function"); + Info = analyzeResourceUsage(MF); + } + + if (STM.isAmdPalOS()) + EmitPALMetadata(MF, CurrentProgramInfo); + else if (!STM.isAmdHsaOS()) { + EmitProgramInfoSI(MF, CurrentProgramInfo); } DisasmLines.clear(); @@ -340,84 +438,74 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); OutStreamer->SwitchSection(CommentSection); - if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - if (!MFI->isEntryFunction()) { - OutStreamer->emitRawComment(" Function info:", false); - SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()]; - emitCommonFunctionComments( - Info.NumVGPR, - Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()), - Info.PrivateSegmentSize, - getFunctionCodeSize(MF)); - return false; - } - - OutStreamer->emitRawComment(" Kernel info:", false); - emitCommonFunctionComments(CurrentProgramInfo.NumVGPR, - CurrentProgramInfo.NumSGPR, - CurrentProgramInfo.ScratchSize, - getFunctionCodeSize(MF)); - - OutStreamer->emitRawComment( - " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); - OutStreamer->emitRawComment( - " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false); - OutStreamer->emitRawComment( - " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) + - " bytes/workgroup (compile time only)", false); - - OutStreamer->emitRawComment( - " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false); - OutStreamer->emitRawComment( - " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false); - - OutStreamer->emitRawComment( - " NumSGPRsForWavesPerEU: " + - Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false); - OutStreamer->emitRawComment( - " NumVGPRsForWavesPerEU: " + - Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); - - OutStreamer->emitRawComment( - " ReservedVGPRFirst: " + Twine(CurrentProgramInfo.ReservedVGPRFirst), - false); - OutStreamer->emitRawComment( - " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount), - false); - - if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) { - OutStreamer->emitRawComment( - " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + - Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false); - OutStreamer->emitRawComment( - " DebuggerPrivateSegmentBufferSGPR: s" + - Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false); - } + if (!MFI->isEntryFunction()) { + OutStreamer->emitRawComment(" Function info:", false); 
+ SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()]; + emitCommonFunctionComments( + Info.NumVGPR, + Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()), + Info.PrivateSegmentSize, + getFunctionCodeSize(MF), MFI); + return false; + } + OutStreamer->emitRawComment(" Kernel info:", false); + emitCommonFunctionComments(CurrentProgramInfo.NumVGPR, + CurrentProgramInfo.NumSGPR, + CurrentProgramInfo.ScratchSize, + getFunctionCodeSize(MF), MFI); + + OutStreamer->emitRawComment( + " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); + OutStreamer->emitRawComment( + " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false); + OutStreamer->emitRawComment( + " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) + + " bytes/workgroup (compile time only)", false); + + OutStreamer->emitRawComment( + " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false); + OutStreamer->emitRawComment( + " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false); + + OutStreamer->emitRawComment( + " NumSGPRsForWavesPerEU: " + + Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false); + OutStreamer->emitRawComment( + " NumVGPRsForWavesPerEU: " + + Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); + + OutStreamer->emitRawComment( + " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); + + if (MF.getSubtarget<GCNSubtarget>().debuggerEmitPrologue()) { OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:USER_SGPR: " + - Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + - Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false); + " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + + Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false); OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TGID_X_EN: " + - Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TGID_Y_EN: " + - Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TGID_Z_EN: " + - Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); - OutStreamer->emitRawComment( - " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + - Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)), - false); - } else { - R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); - OutStreamer->emitRawComment( - Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize))); + " DebuggerPrivateSegmentBufferSGPR: s" + + Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false); } + + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:USER_SGPR: " + + Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + + Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TGID_X_EN: " + + Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TGID_Y_EN: " + + Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TGID_Z_EN: " + + Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + + Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)), + 
false); } if (STM.dumpCode()) { @@ -440,67 +528,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { return false; } -void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { - unsigned MaxGPR = 0; - bool killPixel = false; - const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>(); - const R600RegisterInfo *RI = STM.getRegisterInfo(); - const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); - - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - if (MI.getOpcode() == AMDGPU::KILLGT) - killPixel = true; - unsigned numOperands = MI.getNumOperands(); - for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { - const MachineOperand &MO = MI.getOperand(op_idx); - if (!MO.isReg()) - continue; - unsigned HWReg = RI->getHWRegIndex(MO.getReg()); - - // Register with value > 127 aren't GPR - if (HWReg > 127) - continue; - MaxGPR = std::max(MaxGPR, HWReg); - } - } - } - - unsigned RsrcReg; - if (STM.getGeneration() >= R600Subtarget::EVERGREEN) { - // Evergreen / Northern Islands - switch (MF.getFunction().getCallingConv()) { - default: LLVM_FALLTHROUGH; - case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; - case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; - case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; - case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; - } - } else { - // R600 / R700 - switch (MF.getFunction().getCallingConv()) { - default: LLVM_FALLTHROUGH; - case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH; - case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH; - case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; - case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; - } - } - - OutStreamer->EmitIntValue(RsrcReg, 4); - OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | - S_STACK_SIZE(MFI->CFStackSize), 4); - OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); - OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); - - if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { - OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); - OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4); - } -} - uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const { - const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = STM.getInstrInfo(); uint64_t CodeSize = 0; @@ -510,7 +539,7 @@ uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const // TODO: CodeSize should account for multiple functions. // TODO: Should we count size of debug info? 
- if (MI.isDebugValue()) + if (MI.isDebugInstr()) continue; CodeSize += TII->getInstSizeInBytes(MI); @@ -531,30 +560,10 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, return false; } -static unsigned getNumExtraSGPRs(const SISubtarget &ST, - bool VCCUsed, - bool FlatScrUsed) { - unsigned ExtraSGPRs = 0; - if (VCCUsed) - ExtraSGPRs = 2; - - if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) { - if (FlatScrUsed) - ExtraSGPRs = 4; - } else { - if (ST.isXNACKEnabled()) - ExtraSGPRs = 4; - - if (FlatScrUsed) - ExtraSGPRs = 6; - } - - return ExtraSGPRs; -} - int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs( - const SISubtarget &ST) const { - return NumExplicitSGPR + getNumExtraSGPRs(ST, UsesVCC, UsesFlatScratch); + const GCNSubtarget &ST) const { + return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(), + UsesVCC, UsesFlatScratch); } AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( @@ -562,7 +571,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( SIFunctionResourceInfo Info; const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); const MachineRegisterInfo &MRI = MF.getRegInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -586,6 +595,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); Info.PrivateSegmentSize = FrameInfo.getStackSize(); + if (MFI->isStackRealigned()) + Info.PrivateSegmentSize += FrameInfo.getMaxAlignment(); Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || @@ -649,7 +660,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( continue; case AMDGPU::NoRegister: - assert(MI.isDebugValue()); + assert(MI.isDebugInstr()); continue; case AMDGPU::VCC: @@ -663,6 +674,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( case AMDGPU::FLAT_SCR_HI: continue; + case AMDGPU::XNACK_MASK: + case AMDGPU::XNACK_MASK_LO: + case AMDGPU::XNACK_MASK_HI: + llvm_unreachable("xnack_mask registers should not be used"); + case AMDGPU::TBA: case AMDGPU::TBA_LO: case AMDGPU::TBA_HI: @@ -742,8 +758,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( // conservative guesses. // 48 SGPRs - vcc, - flat_scr, -xnack - int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true, - ST.hasFlatAddressSpace()); + int MaxSGPRGuess = + 47 - IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(), true, + ST.hasFlatAddressSpace()); MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); MaxVGPR = std::max(MaxVGPR, 23); @@ -798,15 +815,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, MF.getFunction().getContext().diagnose(DiagStackSize); } - const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const SIInstrInfo *TII = STM.getInstrInfo(); const SIRegisterInfo *RI = &TII->getRegisterInfo(); - unsigned ExtraSGPRs = getNumExtraSGPRs(STM, - ProgInfo.VCCUsed, - ProgInfo.FlatUsed); - unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF); + // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are + // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be + // unified. 
+ unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs( + STM.getFeatureBits(), ProgInfo.VCCUsed, ProgInfo.FlatUsed); // Check the addressable register limit before we add ExtraSGPRs. if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && @@ -827,7 +845,19 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // Account for extra SGPRs and VGPRs reserved for debugger use. ProgInfo.NumSGPR += ExtraSGPRs; - ProgInfo.NumVGPR += ExtraVGPRs; + + // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave + // dispatch registers are function args. + unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0; + for (auto &Arg : MF.getFunction().args()) { + unsigned NumRegs = (Arg.getType()->getPrimitiveSizeInBits() + 31) / 32; + if (Arg.hasAttribute(Attribute::InReg)) + WaveDispatchNumSGPR += NumRegs; + else + WaveDispatchNumVGPR += NumRegs; + } + ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR); + ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR); // Adjust number of registers used to meet default/requested minimum/maximum // number of waves per execution unit request. @@ -875,19 +905,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, Ctx.diagnose(Diag); } - // SGPRBlocks is actual number of SGPR blocks minus 1. - ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU, - STM.getSGPREncodingGranule()); - ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / STM.getSGPREncodingGranule() - 1; - - // VGPRBlocks is actual number of VGPR blocks minus 1. - ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU, - STM.getVGPREncodingGranule()); - ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1; - - // Record first reserved VGPR and number of reserved VGPRs. - ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? ProgInfo.NumVGPR : 0; - ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF); + ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks( + STM.getFeatureBits(), ProgInfo.NumSGPRsForWavesPerEU); + ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks( + STM.getFeatureBits(), ProgInfo.NumVGPRsForWavesPerEU); // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" @@ -909,7 +930,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.DX10Clamp = STM.enableDX10Clamp(); unsigned LDSAlignShift; - if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) { + if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { // LDS is allocated in 64 dword blocks. LDSAlignShift = 8; } else { @@ -954,7 +975,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.ComputePGMRSrc2 = S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | - S_00B84C_TRAP_HANDLER(STM.isTrapHandlerEnabled()) | + // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP. + S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 
0 : STM.isTrapHandlerEnabled()) | S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | @@ -981,7 +1003,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) { void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) { - const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv()); @@ -1002,26 +1024,21 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(RsrcReg, 4); OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); - unsigned Rsrc2Val = 0; if (STM.isVGPRSpillingEnabled(MF.getFunction())) { OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); - if (TM.getTargetTriple().getOS() == Triple::AMDPAL) - Rsrc2Val = S_00B84C_SCRATCH_EN(CurrentProgramInfo.ScratchBlocks > 0); - } - if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { - OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); - OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4); - OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); - OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); - Rsrc2Val |= S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks); - } - if (Rsrc2Val) { - OutStreamer->EmitIntValue(RsrcReg + 4 /*rsrc2*/, 4); - OutStreamer->EmitIntValue(Rsrc2Val, 4); } } + if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { + OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); + OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4); + OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); + OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4); + OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); + OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); + } + OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4); OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4); OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4); @@ -1114,8 +1131,12 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &CurrentProgramInfo, const MachineFunction &MF) const { + const Function &F = MF.getFunction(); + assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || + F.getCallingConv() == CallingConv::SPIR_KERNEL); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits()); @@ -1151,21 +1172,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, if (MFI->hasFlatScratchInit()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; - if (MFI->hasGridWorkgroupCountX()) { - Out.code_properties |= - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X; - } - - if (MFI->hasGridWorkgroupCountY()) { - Out.code_properties |= - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y; - } - - if (MFI->hasGridWorkgroupCountZ()) { - Out.code_properties |= - AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z; - } - if 
(MFI->hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; @@ -1175,20 +1181,17 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, if (STM.isXNACKEnabled()) Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; - // FIXME: Should use getKernArgSize - Out.kernarg_segment_byte_size = - STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset()); + unsigned MaxKernArgAlign; + Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize; - Out.reserved_vgpr_first = CurrentProgramInfo.ReservedVGPRFirst; - Out.reserved_vgpr_count = CurrentProgramInfo.ReservedVGPRCount; // These alignment values are specified in powers of two, so alignment = // 2^n. The minimum alignment is 2^4 = 16. Out.kernarg_segment_alignment = std::max((size_t)4, - countTrailingZeros(MFI->getMaxKernArgAlign())); + countTrailingZeros(MaxKernArgAlign)); if (STM.debuggerEmitPrologue()) { Out.debug_wavefront_private_segment_offset_sgpr = @@ -1198,55 +1201,6 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, } } -AMDGPU::HSAMD::Kernel::CodeProps::Metadata AMDGPUAsmPrinter::getHSACodeProps( - const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const { - const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - HSAMD::Kernel::CodeProps::Metadata HSACodeProps; - - HSACodeProps.mKernargSegmentSize = - STM.getKernArgSegmentSize(MF, MFI.getABIArgOffset()); - HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize; - HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize; - HSACodeProps.mKernargSegmentAlign = - std::max(uint32_t(4), MFI.getMaxKernArgAlign()); - HSACodeProps.mWavefrontSize = STM.getWavefrontSize(); - HSACodeProps.mNumSGPRs = CurrentProgramInfo.NumSGPR; - HSACodeProps.mNumVGPRs = CurrentProgramInfo.NumVGPR; - HSACodeProps.mMaxFlatWorkGroupSize = MFI.getMaxFlatWorkGroupSize(); - HSACodeProps.mIsDynamicCallStack = ProgramInfo.DynamicCallStack; - HSACodeProps.mIsXNACKEnabled = STM.isXNACKEnabled(); - HSACodeProps.mNumSpilledSGPRs = MFI.getNumSpilledSGPRs(); - HSACodeProps.mNumSpilledVGPRs = MFI.getNumSpilledVGPRs(); - - return HSACodeProps; -} - -AMDGPU::HSAMD::Kernel::DebugProps::Metadata AMDGPUAsmPrinter::getHSADebugProps( - const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const { - const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); - HSAMD::Kernel::DebugProps::Metadata HSADebugProps; - - if (!STM.debuggerSupported()) - return HSADebugProps; - - HSADebugProps.mDebuggerABIVersion.push_back(1); - HSADebugProps.mDebuggerABIVersion.push_back(0); - HSADebugProps.mReservedNumVGPRs = ProgramInfo.ReservedVGPRCount; - HSADebugProps.mReservedFirstVGPR = ProgramInfo.ReservedVGPRFirst; - - if (STM.debuggerEmitPrologue()) { - HSADebugProps.mPrivateSegmentBufferSGPR = - ProgramInfo.DebuggerPrivateSegmentBufferSGPR; - HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR = - ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; - } - - return HSADebugProps; -} - bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 
51d48a0c7320..22982d912c70 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU Assembly printer class. +/// AMDGPU Assembly printer class. // //===----------------------------------------------------------------------===// @@ -17,9 +17,11 @@ #include "AMDGPU.h" #include "AMDKernelCodeT.h" -#include "MCTargetDesc/AMDGPUHSAMetadataStreamer.h" +#include "AMDGPUHSAMetadataStreamer.h" +#include "SIProgramInfo.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/Support/AMDHSAKernelDescriptor.h" #include <cstddef> #include <cstdint> #include <limits> @@ -29,9 +31,10 @@ namespace llvm { +class AMDGPUMachineFunction; class AMDGPUTargetStreamer; class MCOperand; -class SISubtarget; +class GCNSubtarget; class AMDGPUAsmPrinter final : public AsmPrinter { private: @@ -47,68 +50,7 @@ private: bool HasDynamicallySizedStack = false; bool HasRecursion = false; - int32_t getTotalNumSGPRs(const SISubtarget &ST) const; - }; - - // Track resource usage for kernels / entry functions. - struct SIProgramInfo { - // Fields set in PGM_RSRC1 pm4 packet. - uint32_t VGPRBlocks = 0; - uint32_t SGPRBlocks = 0; - uint32_t Priority = 0; - uint32_t FloatMode = 0; - uint32_t Priv = 0; - uint32_t DX10Clamp = 0; - uint32_t DebugMode = 0; - uint32_t IEEEMode = 0; - uint64_t ScratchSize = 0; - - uint64_t ComputePGMRSrc1 = 0; - - // Fields set in PGM_RSRC2 pm4 packet. - uint32_t LDSBlocks = 0; - uint32_t ScratchBlocks = 0; - - uint64_t ComputePGMRSrc2 = 0; - - uint32_t NumVGPR = 0; - uint32_t NumSGPR = 0; - uint32_t LDSSize = 0; - bool FlatUsed = false; - - // Number of SGPRs that meets number of waves per execution unit request. - uint32_t NumSGPRsForWavesPerEU = 0; - - // Number of VGPRs that meets number of waves per execution unit request. - uint32_t NumVGPRsForWavesPerEU = 0; - - // If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first - // fixed VGPR number reserved. - uint16_t ReservedVGPRFirst = 0; - - // The number of consecutive VGPRs reserved. - uint16_t ReservedVGPRCount = 0; - - // Fixed SGPR number used to hold wave scratch offset for entire kernel - // execution, or std::numeric_limits<uint16_t>::max() if the register is not - // used or not known. - uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR = - std::numeric_limits<uint16_t>::max(); - - // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire - // kernel execution, or std::numeric_limits<uint16_t>::max() if the register - // is not used or not known. - uint16_t DebuggerPrivateSegmentBufferSGPR = - std::numeric_limits<uint16_t>::max(); - - // Whether there is recursion, dynamic allocas, indirect calls or some other - // reason there may be statically unknown stack usage. - bool DynamicCallStack = false; - - // Bonus information for debugging. 
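An aside on the kernarg_segment_alignment change in the getAmdKernelCode hunk above: the field stores log2 of the kernarg segment alignment, clamped to a minimum of 2^4 = 16 bytes, so countTrailingZeros of a power-of-two alignment is exactly the exponent being encoded. A minimal standalone sketch of that encoding with hypothetical alignment values (the local helper stands in for llvm::countTrailingZeros):

  #include <cassert>
  #include <cstdint>

  // log2 of a nonzero power-of-two alignment: the count of trailing zero bits.
  static unsigned log2PowerOfTwo(uint64_t Align) {
    unsigned N = 0;
    while ((Align & 1) == 0) { Align >>= 1; ++N; }
    return N;
  }

  // kernarg_segment_alignment is stored as n where alignment = 2^n,
  // with a floor of n = 4 (16 bytes).
  static unsigned encodeKernargAlign(uint64_t MaxKernArgAlign) {
    unsigned Exp = log2PowerOfTwo(MaxKernArgAlign);
    return Exp < 4 ? 4 : Exp;
  }

  int main() {
    assert(encodeKernargAlign(8) == 4);   // 8-byte max alignment still encodes 16
    assert(encodeKernargAlign(16) == 4);
    assert(encodeKernargAlign(64) == 6);  // 2^6 = 64-byte alignment
    return 0;
  }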
- bool VCCUsed = false; - - SIProgramInfo() = default; + int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const; }; SIProgramInfo CurrentProgramInfo; @@ -128,16 +70,8 @@ private: unsigned &NumSGPR, unsigned &NumVGPR) const; - AMDGPU::HSAMD::Kernel::CodeProps::Metadata getHSACodeProps( - const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const; - AMDGPU::HSAMD::Kernel::DebugProps::Metadata getHSADebugProps( - const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const; - - /// \brief Emit register usage information so that the GPU driver + /// Emit register usage information so that the GPU driver /// can correctly setup the GPU state. - void EmitProgramInfoR600(const MachineFunction &MF); void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); void EmitPALMetadata(const MachineFunction &MF, @@ -145,7 +79,15 @@ private: void emitCommonFunctionComments(uint32_t NumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, - uint64_t CodeSize); + uint64_t CodeSize, + const AMDGPUMachineFunction* MFI); + + uint16_t getAmdhsaKernelCodeProperties( + const MachineFunction &MF) const; + + amdhsa::kernel_descriptor_t getAmdhsaKernelDescriptor( + const MachineFunction &MF, + const SIProgramInfo &PI) const; public: explicit AMDGPUAsmPrinter(TargetMachine &TM, @@ -160,16 +102,16 @@ public: bool doFinalization(Module &M) override; bool runOnMachineFunction(MachineFunction &MF) override; - /// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated + /// Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated /// pseudo lowering. bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; - /// \brief Lower the specified LLVM Constant to an MCExpr. + /// Lower the specified LLVM Constant to an MCExpr. /// The AsmPrinter::lowerConstantof does not know how to lower /// addrspacecast, therefore they should be lowered by this function. const MCExpr *lowerConstant(const Constant *CV) override; - /// \brief tblgen'erated driver function for lowering simple MI->MC pseudo + /// tblgen'erated driver function for lowering simple MI->MC pseudo /// instructions. bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, const MachineInstr *MI); @@ -179,6 +121,8 @@ public: void EmitFunctionBodyStart() override; + void EmitFunctionBodyEnd() override; + void EmitFunctionEntryLabel() override; void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override; diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 5a9138731934..18c7df0d94f2 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -20,6 +20,7 @@ #include "SIISelLowering.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -32,13 +33,17 @@ AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, unsigned VReg) const { + // FIXME: Add support for non-void returns. 
+ if (Val) + return false; + MIRBuilder.buildInstr(AMDGPU::S_ENDPGM); return true; } unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, - unsigned Offset) const { + uint64_t Offset) const { MachineFunction &MF = MIRBuilder.getMF(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -61,7 +66,8 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, } void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, - Type *ParamTy, unsigned Offset, + Type *ParamTy, uint64_t Offset, + unsigned Align, unsigned DstReg) const { MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); @@ -69,7 +75,6 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); unsigned TypeSize = DL.getTypeStoreSize(ParamTy); - unsigned Align = DL.getABITypeAlignment(ParamTy); unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset); MachineMemOperand *MMO = @@ -84,12 +89,16 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const { + // AMDGPU_GS and AMDGP_HS are not supported yet. + if (F.getCallingConv() == CallingConv::AMDGPU_GS || + F.getCallingConv() == CallingConv::AMDGPU_HS) + return false; MachineFunction &MF = MIRBuilder.getMF(); - const SISubtarget *Subtarget = static_cast<const SISubtarget *>(&MF.getSubtarget()); + const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>(); MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo(); + const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); const DataLayout &DL = F.getParent()->getDataLayout(); SmallVector<CCValAssign, 16> ArgLocs; @@ -116,7 +125,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, if (Info->hasKernargSegmentPtr()) { unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); - const LLT P2 = LLT::pointer(2, 64); + const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); unsigned VReg = MRI.createGenericVirtualRegister(P2); MRI.addLiveIn(InputPtrReg, VReg); MIRBuilder.getMBB().addLiveIn(InputPtrReg); @@ -136,49 +145,106 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, CCInfo.AllocateReg(FlatScratchInitReg); } + // The infrastructure for normal calling convention lowering is essentially + // useless for kernels. We want to avoid any kind of legalization or argument + // splitting. + if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) { + unsigned i = 0; + const unsigned KernArgBaseAlign = 16; + const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F); + uint64_t ExplicitArgOffset = 0; + + // TODO: Align down to dword alignment and extract bits for extending loads. 
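The loop that follows lays explicit kernel arguments out by ABI alignment on top of a kernarg base assumed to be 16-byte aligned, and the alignment recorded for each argument load is the minimum of that base alignment and the argument's offset. A self-contained sketch of that arithmetic with hypothetical argument sizes (alignTo and MinAlign are local stand-ins for LLVM's MathExtras helpers of the same name; BaseOffset, which the pass gets from getExplicitKernelArgOffset, is taken as 0 here for illustration):

  #include <cstdint>
  #include <cstdio>

  static uint64_t alignTo(uint64_t Value, uint64_t Align) {
    return (Value + Align - 1) / Align * Align;
  }

  // Largest power of two that divides both A and B.
  static uint64_t MinAlign(uint64_t A, uint64_t B) {
    return (A | B) & (1 + ~(A | B));
  }

  int main() {
    const uint64_t KernArgBaseAlign = 16;
    const uint64_t BaseOffset = 0;
    // Hypothetical arguments as {alloc size, ABI alignment} pairs: i32, i64, i16.
    const uint64_t Args[][2] = {{4, 4}, {8, 8}, {2, 2}};

    uint64_t ExplicitArgOffset = 0;
    for (const auto &A : Args) {
      uint64_t AllocSize = A[0], ABIAlign = A[1];
      uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
      ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
      uint64_t Align = MinAlign(KernArgBaseAlign, ArgOffset);
      std::printf("offset %llu, load align %llu\n",
                  (unsigned long long)ArgOffset, (unsigned long long)Align);
    }
    return 0; // prints offsets 0, 8, 16 with alignments 16, 8, 16
  }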
+ for (auto &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + unsigned AllocSize = DL.getTypeAllocSize(ArgTy); + if (AllocSize == 0) + continue; + + unsigned ABIAlign = DL.getABITypeAlignment(ArgTy); + + uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset; + ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize; + + unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset); + ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy)); + lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, VRegs[i]); + ++i; + } + + return true; + } + unsigned NumArgs = F.arg_size(); Function::const_arg_iterator CurOrigArg = F.arg_begin(); const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>(); + unsigned PSInputNum = 0; + BitVector Skipped(NumArgs); for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) { EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType()); // We can only hanlde simple value types at the moment. - if (!ValEVT.isSimple()) - return false; - MVT ValVT = ValEVT.getSimpleVT(); ISD::ArgFlagsTy Flags; ArgInfo OrigArg{VRegs[i], CurOrigArg->getType()}; setArgFlags(OrigArg, i + 1, DL, F); Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType())); + + if (F.getCallingConv() == CallingConv::AMDGPU_PS && + !OrigArg.Flags.isInReg() && !OrigArg.Flags.isByVal() && + PSInputNum <= 15) { + if (CurOrigArg->use_empty() && !Info->isPSInputAllocated(PSInputNum)) { + Skipped.set(i); + ++PSInputNum; + continue; + } + + Info->markPSInputAllocated(PSInputNum); + if (!CurOrigArg->use_empty()) + Info->markPSInputEnabled(PSInputNum); + + ++PSInputNum; + } + CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false); - bool Res = - AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo); - // Fail if we don't know how to handle this type. - if (Res) - return false; + if (ValEVT.isVector()) { + EVT ElemVT = ValEVT.getVectorElementType(); + if (!ValEVT.isSimple()) + return false; + MVT ValVT = ElemVT.getSimpleVT(); + bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, + OrigArg.Flags, CCInfo); + if (!Res) + return false; + } else { + MVT ValVT = ValEVT.getSimpleVT(); + if (!ValEVT.isSimple()) + return false; + bool Res = + AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo); + + // Fail if we don't know how to handle this type. + if (Res) + return false; + } } Function::const_arg_iterator Arg = F.arg_begin(); - if (F.getCallingConv() == CallingConv::AMDGPU_VS) { - for (unsigned i = 0; i != NumArgs; ++i, ++Arg) { - CCValAssign &VA = ArgLocs[i]; - MRI.addLiveIn(VA.getLocReg(), VRegs[i]); + if (F.getCallingConv() == CallingConv::AMDGPU_VS || + F.getCallingConv() == CallingConv::AMDGPU_PS) { + for (unsigned i = 0, OrigArgIdx = 0; + OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) { + if (Skipped.test(OrigArgIdx)) + continue; + CCValAssign &VA = ArgLocs[i++]; + MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx]); MIRBuilder.getMBB().addLiveIn(VA.getLocReg()); - MIRBuilder.buildCopy(VRegs[i], VA.getLocReg()); + MIRBuilder.buildCopy(VRegs[OrigArgIdx], VA.getLocReg()); } return true; } - for (unsigned i = 0; i != NumArgs; ++i, ++Arg) { - // FIXME: We should be getting DebugInfo from the arguments some how. 
- CCValAssign &VA = ArgLocs[i]; - lowerParameter(MIRBuilder, Arg->getType(), - VA.getLocMemOffset() + - Subtarget->getExplicitKernelArgOffset(MF), VRegs[i]); - } - - return true; + return false; } diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h index 251cb7a2c440..f51cb6abbf65 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -26,10 +26,11 @@ class AMDGPUCallLowering: public CallLowering { AMDGPUAS AMDGPUASI; unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, - unsigned Offset) const; + uint64_t Offset) const; void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy, - unsigned Offset, unsigned DstReg) const; + uint64_t Offset, unsigned Align, + unsigned DstReg) const; public: AMDGPUCallLowering(const AMDGPUTargetLowering &TLI); diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td index c1c066fd1404..68bc7fdd9961 100644 --- a/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -85,22 +85,6 @@ def RetCC_SI_Shader : CallingConv<[ ]>> ]>; -// Calling convention for R600 -def CC_R600 : CallingConv<[ - CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[ - T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW, - T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW, - T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW, - T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW, - T30_XYZW, T31_XYZW, T32_XYZW - ]>>> -]>; - -// Calling convention for compute kernels -def CC_AMDGPU_Kernel : CallingConv<[ - CCCustom<"allocateKernArg"> -]>; - def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs< (sequence "VGPR%u", 24, 255) >; @@ -127,7 +111,7 @@ def CC_AMDGPU_Func : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>, + CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>, CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>, CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>, CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>, @@ -144,30 +128,16 @@ def RetCC_AMDGPU_Func : CallingConv<[ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">> + CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">> ]>; def CC_AMDGPU : CallingConv<[ - CCIf<"static_cast<const AMDGPUSubtarget&>" - "(State.getMachineFunction().getSubtarget()).getGeneration() >=" - "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "!AMDGPU::isShader(State.getCallingConv())", - CCDelegateTo<CC_AMDGPU_Kernel>>, - CCIf<"static_cast<const AMDGPUSubtarget&>" - "(State.getMachineFunction().getSubtarget()).getGeneration() < " - "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "!AMDGPU::isShader(State.getCallingConv())", - CCDelegateTo<CC_AMDGPU_Kernel>>, - 
CCIf<"static_cast<const AMDGPUSubtarget&>" + CCIf<"static_cast<const GCNSubtarget&>" "(State.getMachineFunction().getSubtarget()).getGeneration() >= " "AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_SI>>, - CCIf<"static_cast<const AMDGPUSubtarget&>" + CCIf<"static_cast<const GCNSubtarget&>" "(State.getMachineFunction().getSubtarget()).getGeneration() >= " "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C", - CCDelegateTo<CC_AMDGPU_Func>>, - CCIf<"static_cast<const AMDGPUSubtarget&>" - "(State.getMachineFunction().getSubtarget()).getGeneration() < " - "AMDGPUSubtarget::SOUTHERN_ISLANDS", - CCDelegateTo<CC_R600>> + CCDelegateTo<CC_AMDGPU_Func>> ]>; diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index b17b67167666..5713b7b7f9a8 100644 --- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -17,8 +17,10 @@ #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" @@ -48,15 +50,22 @@ using namespace llvm; namespace { +static cl::opt<bool> WidenLoads( + "amdgpu-codegenprepare-widen-constant-loads", + cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, + cl::init(true)); + class AMDGPUCodeGenPrepare : public FunctionPass, public InstVisitor<AMDGPUCodeGenPrepare, bool> { - const SISubtarget *ST = nullptr; + const GCNSubtarget *ST = nullptr; + AssumptionCache *AC = nullptr; DivergenceAnalysis *DA = nullptr; Module *Mod = nullptr; bool HasUnsafeFPMath = false; AMDGPUAS AMDGPUASI; - /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to + /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to /// binary operation \p V. /// /// \returns Binary operation \p V. @@ -80,7 +89,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// false otherwise. bool needsPromotionToI32(const Type *T) const; - /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary + /// Promotes uniform binary operation \p I to equivalent 32 bit binary /// operation. /// /// \details \p I's base element bit width must be greater than 1 and less @@ -93,7 +102,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// false otherwise. bool promoteUniformOpToI32(BinaryOperator &I) const; - /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation. + /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation. /// /// \details \p I's base element bit width must be greater than 1 and less /// than or equal 16. Promotion is done by sign or zero extending operands to @@ -102,7 +111,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// \returns True. bool promoteUniformOpToI32(ICmpInst &I) const; - /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select' + /// Promotes uniform 'select' operation \p I to 32 bit 'select' /// operation. /// /// \details \p I's base element bit width must be greater than 1 and less @@ -113,7 +122,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// \returns True. 
bool promoteUniformOpToI32(SelectInst &I) const; - /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse' + /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse' /// intrinsic. /// /// \details \p I's base element bit width must be greater than 1 and less @@ -125,7 +134,17 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// /// \returns True. bool promoteUniformBitreverseToI32(IntrinsicInst &I) const; - /// \brief Widen a scalar load. + + /// Expands 24 bit div or rem. + Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I, + Value *Num, Value *Den, + bool IsDiv, bool IsSigned) const; + + /// Expands 32 bit div or rem. + Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I, + Value *Num, Value *Den) const; + + /// Widen a scalar load. /// /// \details \p Widen scalar load for uniform, small type loads from constant // memory / to a full 32-bits and then truncate the input to allow a scalar @@ -157,6 +176,7 @@ public: StringRef getPassName() const override { return "AMDGPU IR optimizations"; } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DivergenceAnalysis>(); AU.setPreservesAll(); } @@ -250,7 +270,9 @@ bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const { "I does not need promotion to i32"); if (I.getOpcode() == Instruction::SDiv || - I.getOpcode() == Instruction::UDiv) + I.getOpcode() == Instruction::UDiv || + I.getOpcode() == Instruction::SRem || + I.getOpcode() == Instruction::URem) return false; IRBuilder<> Builder(&I); @@ -372,13 +394,18 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32( return true; } -static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) { +static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) { const ConstantFP *CNum = dyn_cast<ConstantFP>(Num); if (!CNum) - return false; + return HasDenormals; + + if (UnsafeDiv) + return true; + + bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0); // Reciprocal f32 is handled separately without denormals. - return UnsafeDiv || CNum->isExactlyValue(+1.0); + return HasDenormals ^ IsOne; } // Insert an intrinsic for fast fdiv for safe math situations where we can @@ -404,7 +431,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { FMF.allowReciprocal(); // With UnsafeDiv node will be optimized to just rcp and mul. 
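For reference, the reworked shouldKeepFDivF32 above boils down to a small decision table over the numerator, the unsafe-math flag, and whether f32 denormals are enabled. A standalone restatement of just that predicate (the enum is an informal stand-in for the ConstantFP inspection in the real code):

  #include <cassert>

  enum class Numerator { NonConstant, PlusMinusOne, OtherConstant };

  // true = keep the plain fdiv; false = emit the fast-division intrinsic call.
  static bool shouldKeepFDivF32(Numerator Num, bool UnsafeDiv, bool HasDenormals) {
    if (Num == Numerator::NonConstant)
      return HasDenormals;                  // fast path only without denormals
    if (UnsafeDiv)
      return true;
    bool IsOne = (Num == Numerator::PlusMinusOne);
    // Per the comment in the pass, reciprocal f32 is handled separately
    // without denormals, so +/-1.0 numerators are kept in that case.
    return HasDenormals ^ IsOne;
  }

  int main() {
    assert( shouldKeepFDivF32(Numerator::PlusMinusOne, false, false));
    assert(!shouldKeepFDivF32(Numerator::PlusMinusOne, false, true));
    assert( shouldKeepFDivF32(Numerator::OtherConstant, false, true));
    assert(!shouldKeepFDivF32(Numerator::OtherConstant, false, false));
    return 0;
  }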
- if (ST->hasFP32Denormals() || UnsafeDiv) + if (UnsafeDiv) return false; IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); @@ -418,6 +445,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { Value *NewFDiv = nullptr; + bool HasDenormals = ST->hasFP32Denormals(); if (VectorType *VT = dyn_cast<VectorType>(Ty)) { NewFDiv = UndefValue::get(VT); @@ -428,7 +456,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { Value *DenEltI = Builder.CreateExtractElement(Den, I); Value *NewElt; - if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) { + if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) { NewElt = Builder.CreateFDiv(NumEltI, DenEltI); } else { NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }); @@ -437,7 +465,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); } } else { - if (!shouldKeepFDivF32(Num, UnsafeDiv)) + if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals)) NewFDiv = Builder.CreateCall(Decl, { Num, Den }); } @@ -447,7 +475,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { FDiv.eraseFromParent(); } - return true; + return !!NewFDiv; } static bool hasUnsafeFPMath(const Function &F) { @@ -455,18 +483,324 @@ static bool hasUnsafeFPMath(const Function &F) { return Attr.getValueAsString() == "true"; } +static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder, + Value *LHS, Value *RHS) { + Type *I32Ty = Builder.getInt32Ty(); + Type *I64Ty = Builder.getInt64Ty(); + + Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty); + Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty); + Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64); + Value *Lo = Builder.CreateTrunc(MUL64, I32Ty); + Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32)); + Hi = Builder.CreateTrunc(Hi, I32Ty); + return std::make_pair(Lo, Hi); +} + +static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) { + return getMul64(Builder, LHS, RHS).second; +} + +// The fractional part of a float is enough to accurately represent up to +// a 24-bit signed integer. +Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, + BinaryOperator &I, + Value *Num, Value *Den, + bool IsDiv, bool IsSigned) const { + assert(Num->getType()->isIntegerTy(32)); + + const DataLayout &DL = Mod->getDataLayout(); + unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I); + if (LHSSignBits < 9) + return nullptr; + + unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I); + if (RHSSignBits < 9) + return nullptr; + + + unsigned SignBits = std::min(LHSSignBits, RHSSignBits); + unsigned DivBits = 32 - SignBits; + if (IsSigned) + ++DivBits; + + Type *Ty = Num->getType(); + Type *I32Ty = Builder.getInt32Ty(); + Type *F32Ty = Builder.getFloatTy(); + ConstantInt *One = Builder.getInt32(1); + Value *JQ = One; + + if (IsSigned) { + // char|short jq = ia ^ ib; + JQ = Builder.CreateXor(Num, Den); + + // jq = jq >> (bitsize - 2) + JQ = Builder.CreateAShr(JQ, Builder.getInt32(30)); + + // jq = jq | 0x1 + JQ = Builder.CreateOr(JQ, One); + } + + // int ia = (int)LHS; + Value *IA = Num; + + // int ib, (int)RHS; + Value *IB = Den; + + // float fa = (float)ia; + Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty) + : Builder.CreateUIToFP(IA, F32Ty); + + // float fb = (float)ib; + Value *FB = IsSigned ? 
Builder.CreateSIToFP(IB,F32Ty) + : Builder.CreateUIToFP(IB,F32Ty); + + Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB); + Value *FQM = Builder.CreateFMul(FA, RCP); + + // fq = trunc(fqm); + CallInst* FQ = Builder.CreateIntrinsic(Intrinsic::trunc, { FQM }); + FQ->copyFastMathFlags(Builder.getFastMathFlags()); + + // float fqneg = -fq; + Value *FQNeg = Builder.CreateFNeg(FQ); + + // float fr = mad(fqneg, fb, fa); + Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz, + { FQNeg, FB, FA }, FQ); + + // int iq = (int)fq; + Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty) + : Builder.CreateFPToUI(FQ, I32Ty); + + // fr = fabs(fr); + FR = Builder.CreateIntrinsic(Intrinsic::fabs, { FR }, FQ); + + // fb = fabs(fb); + FB = Builder.CreateIntrinsic(Intrinsic::fabs, { FB }, FQ); + + // int cv = fr >= fb; + Value *CV = Builder.CreateFCmpOGE(FR, FB); + + // jq = (cv ? jq : 0); + JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0)); + + // dst = iq + jq; + Value *Div = Builder.CreateAdd(IQ, JQ); + + Value *Res = Div; + if (!IsDiv) { + // Rem needs compensation, it's easier to recompute it + Value *Rem = Builder.CreateMul(Div, Den); + Res = Builder.CreateSub(Num, Rem); + } + + // Truncate to number of bits this divide really is. + if (IsSigned) { + Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits)); + Res = Builder.CreateSExt(Res, Ty); + } else { + ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1); + Res = Builder.CreateAnd(Res, TruncMask); + } + + return Res; +} + +Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder, + BinaryOperator &I, + Value *Num, Value *Den) const { + Instruction::BinaryOps Opc = I.getOpcode(); + assert(Opc == Instruction::URem || Opc == Instruction::UDiv || + Opc == Instruction::SRem || Opc == Instruction::SDiv); + + FastMathFlags FMF; + FMF.setFast(); + Builder.setFastMathFlags(FMF); + + if (isa<Constant>(Den)) + return nullptr; // Keep it for optimization + + bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv; + bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv; + + Type *Ty = Num->getType(); + Type *I32Ty = Builder.getInt32Ty(); + Type *F32Ty = Builder.getFloatTy(); + + if (Ty->getScalarSizeInBits() < 32) { + if (IsSigned) { + Num = Builder.CreateSExt(Num, I32Ty); + Den = Builder.CreateSExt(Den, I32Ty); + } else { + Num = Builder.CreateZExt(Num, I32Ty); + Den = Builder.CreateZExt(Den, I32Ty); + } + } + + if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) { + Res = Builder.CreateTrunc(Res, Ty); + return Res; + } + + ConstantInt *Zero = Builder.getInt32(0); + ConstantInt *One = Builder.getInt32(1); + ConstantInt *MinusOne = Builder.getInt32(~0); + + Value *Sign = nullptr; + if (IsSigned) { + ConstantInt *K31 = Builder.getInt32(31); + Value *LHSign = Builder.CreateAShr(Num, K31); + Value *RHSign = Builder.CreateAShr(Den, K31); + // Remainder sign is the same as LHS + Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign; + + Num = Builder.CreateAdd(Num, LHSign); + Den = Builder.CreateAdd(Den, RHSign); + + Num = Builder.CreateXor(Num, LHSign); + Den = Builder.CreateXor(Den, RHSign); + } + + // RCP = URECIP(Den) = 2^32 / Den + e + // e is rounding error. 
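Stepping back from the IR above: expandDivRem24 builds a truncated single-precision quotient from a reciprocal estimate, reconstructs that estimate's remainder with a fused multiply-add, and bumps the integer quotient by jq when the remainder reaches the divisor in magnitude. A self-contained model of the same sequence in plain C++, assuming both operands fit comfortably in 24 bits as the sign-bit checks above guarantee (std::fma and an ordinary float reciprocal stand in for the hardware rcp/fmad_ftz the pass emits):

  #include <cassert>
  #include <cmath>
  #include <cstdint>

  // Signed division via f32, mirroring the structure of expandDivRem24.
  static int32_t div24(int32_t ia, int32_t ib) {
    // jq is +/-1 with the sign of the true quotient; the pass computes it as
    // ((ia ^ ib) >> 30) | 1.
    int32_t jq = ((ia ^ ib) < 0) ? -1 : 1;
    float fa = (float)ia;
    float fb = (float)ib;
    float fq = std::trunc(fa * (1.0f / fb));   // truncated quotient estimate
    float fr = std::fma(-fq, fb, fa);          // remainder of the estimate
    int32_t iq = (int32_t)fq;
    if (std::fabs(fr) >= std::fabs(fb))        // estimate fell one short: fix up
      iq += jq;
    return iq;
  }

  int main() {
    const int32_t As[] = {100, -100, 8388607, -7777777};
    const int32_t Bs[] = {1, -3, 7, 4095};
    for (int32_t a : As)
      for (int32_t b : Bs)
        assert(div24(a, b) == a / b);          // matches C truncating division
    return 0;
  }

The 32-bit path that begins below follows the same idea at a larger scale: it scales a float reciprocal of the divisor by 2^32, refines it with a computed error term, and then applies two remainder-based corrections to the mulhu quotient.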
+ Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty); + Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32); + Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000)); + Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1); + Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty); + + // RCP_LO, RCP_HI = mul(RCP, Den) */ + Value *RCP_LO, *RCP_HI; + std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den); + + // NEG_RCP_LO = -RCP_LO + Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO); + + // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) + Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero); + Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO); + + // Calculate the rounding error from the URECIP instruction + // E = mulhu(ABS_RCP_LO, RCP) + Value *E = getMulHu(Builder, ABS_RCP_LO, RCP); + + // RCP_A_E = RCP + E + Value *RCP_A_E = Builder.CreateAdd(RCP, E); + + // RCP_S_E = RCP - E + Value *RCP_S_E = Builder.CreateSub(RCP, E); + + // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) + Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E); + + // Quotient = mulhu(Tmp0, Num) + Value *Quotient = getMulHu(Builder, Tmp0, Num); + + // Num_S_Remainder = Quotient * Den + Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den); + + // Remainder = Num - Num_S_Remainder + Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder); + + // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) + Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den); + Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero); + + // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) + Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder); + Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, + MinusOne, Zero); + + // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero + Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero); + Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero); + + Value *Res; + if (IsDiv) { + // Quotient_A_One = Quotient + 1 + Value *Quotient_A_One = Builder.CreateAdd(Quotient, One); + + // Quotient_S_One = Quotient - 1 + Value *Quotient_S_One = Builder.CreateSub(Quotient, One); + + // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) + Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One); + + // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) + Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One); + } else { + // Remainder_S_Den = Remainder - Den + Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den); + + // Remainder_A_Den = Remainder + Den + Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den); + + // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) + Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den); + + // Rem = (Remainder_GE_Zero == 0 ? 
Remainder_A_Den : Rem) + Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den); + } + + if (IsSigned) { + Res = Builder.CreateXor(Res, Sign); + Res = Builder.CreateSub(Res, Sign); + } + + Res = Builder.CreateTrunc(Res, Ty); + + return Res; +} + bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { + if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && + DA->isUniform(&I) && promoteUniformOpToI32(I)) + return true; + bool Changed = false; + Instruction::BinaryOps Opc = I.getOpcode(); + Type *Ty = I.getType(); + Value *NewDiv = nullptr; + if ((Opc == Instruction::URem || Opc == Instruction::UDiv || + Opc == Instruction::SRem || Opc == Instruction::SDiv) && + Ty->getScalarSizeInBits() <= 32) { + Value *Num = I.getOperand(0); + Value *Den = I.getOperand(1); + IRBuilder<> Builder(&I); + Builder.SetCurrentDebugLocation(I.getDebugLoc()); - if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && - DA->isUniform(&I)) - Changed |= promoteUniformOpToI32(I); + if (VectorType *VT = dyn_cast<VectorType>(Ty)) { + NewDiv = UndefValue::get(VT); + + for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) { + Value *NumEltN = Builder.CreateExtractElement(Num, N); + Value *DenEltN = Builder.CreateExtractElement(Den, N); + Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN); + if (!NewElt) + NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); + NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N); + } + } else { + NewDiv = expandDivRem32(Builder, I, Num, Den); + } + + if (NewDiv) { + I.replaceAllUsesWith(NewDiv); + I.eraseFromParent(); + Changed = true; + } + } return Changed; } -bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { - if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && +bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { + if (!WidenLoads) + return false; + + if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && canWidenScalarExtLoad(I)) { IRBuilder<> Builder(&I); Builder.SetCurrentDebugLocation(I.getDebugLoc()); @@ -474,7 +808,28 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { Type *I32Ty = Builder.getInt32Ty(); Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace()); Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT); - Value *WidenLoad = Builder.CreateLoad(BitCast); + LoadInst *WidenLoad = Builder.CreateLoad(BitCast); + WidenLoad->copyMetadata(I); + + // If we have range metadata, we need to convert the type, and not make + // assumptions about the high bits. + if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) { + ConstantInt *Lower = + mdconst::extract<ConstantInt>(Range->getOperand(0)); + + if (Lower->getValue().isNullValue()) { + WidenLoad->setMetadata(LLVMContext::MD_range, nullptr); + } else { + Metadata *LowAndHigh[] = { + ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))), + // Don't make assumptions about the high bits. 
+ ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0)) + }; + + WidenLoad->setMetadata(LLVMContext::MD_range, + MDNode::get(Mod->getContext(), LowAndHigh)); + } + } int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType()); Type *IntNTy = Builder.getIntNTy(TySize); @@ -540,10 +895,12 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { if (!TPC) return false; - const TargetMachine &TM = TPC->getTM<TargetMachine>(); - ST = &TM.getSubtarget<SISubtarget>(F); + const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>(); + ST = &TM.getSubtarget<GCNSubtarget>(F); + AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); DA = &getAnalysis<DivergenceAnalysis>(); HasUnsafeFPMath = hasUnsafeFPMath(F); + AMDGPUASI = TM.getAMDGPUAS(); bool MadeChange = false; @@ -560,6 +917,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) diff --git a/lib/Target/AMDGPU/AMDGPUFeatures.td b/lib/Target/AMDGPU/AMDGPUFeatures.td new file mode 100644 index 000000000000..b375cae9018e --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUFeatures.td @@ -0,0 +1,60 @@ +//===-- AMDGPUFeatures.td - AMDGPU Feature Definitions -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +def FeatureFP64 : SubtargetFeature<"fp64", + "FP64", + "true", + "Enable double precision operations" +>; + +def FeatureFMA : SubtargetFeature<"fmaf", + "FMA", + "true", + "Enable single precision FMA (not as fast as mul+add, but fused)" +>; + +class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature< + "localmemorysize"#Value, + "LocalMemorySize", + !cast<string>(Value), + "The size of local memory in bytes" +>; + +def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; +def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; +def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; + +class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature< + "wavefrontsize"#Value, + "WavefrontSize", + !cast<string>(Value), + "The number of threads per wavefront" +>; + +def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; +def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; +def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; + +class SubtargetFeatureGeneration <string Value, string Subtarget, + list<SubtargetFeature> Implies> : + SubtargetFeature <Value, "Gen", Subtarget#"::"#Value, + Value#" GPU generation", Implies>; + +def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp", + "DX10Clamp", + "true", + "clamp modifier clamps NaNs to 0.0" +>; + +def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", + "EnablePromoteAlloca", + "true", + "Enable promote alloca pass" +>; + diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 91fe921bfeec..ee836bf8a631 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Interface to 
describe a layout of a stack frame on an AMDGPU target. +/// Interface to describe a layout of a stack frame on an AMDGPU target. // //===----------------------------------------------------------------------===// @@ -19,7 +19,7 @@ namespace llvm { -/// \brief Information about the stack frame layout on the AMDGPU targets. +/// Information about the stack frame layout on the AMDGPU targets. /// /// It holds the direction of the stack growth, the known stack alignment on /// entry to each function, and the offset to the locals area. diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td new file mode 100644 index 000000000000..ba735390f679 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUGISel.td @@ -0,0 +1,138 @@ +//===-- AMDGPUGIsel.td - AMDGPU GlobalISel Patterns---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This files contains patterns that should only be used by GlobalISel. For +// example patterns for V_* instructions that have S_* equivalents. +// SelectionDAG does not support selecting V_* instructions. +//===----------------------------------------------------------------------===// + +include "AMDGPU.td" + +def sd_vsrc0 : ComplexPattern<i32, 1, "">; +def gi_vsrc0 : + GIComplexOperandMatcher<s32, "selectVSRC0">, + GIComplexPatternEquiv<sd_vsrc0>; + +def sd_vcsrc : ComplexPattern<i32, 1, "">; +def gi_vcsrc : + GIComplexOperandMatcher<s32, "selectVCSRC">, + GIComplexPatternEquiv<sd_vcsrc>; + +def gi_vop3mods0 : + GIComplexOperandMatcher<s32, "selectVOP3Mods0">, + GIComplexPatternEquiv<VOP3Mods0>; + +def gi_vop3mods : + GIComplexOperandMatcher<s32, "selectVOP3Mods">, + GIComplexPatternEquiv<VOP3Mods>; + +def gi_vop3omods : + GIComplexOperandMatcher<s32, "selectVOP3OMods">, + GIComplexPatternEquiv<VOP3OMods>; + +class GISelSop2Pat < + SDPatternOperator node, + Instruction inst, + ValueType dst_vt, + ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat < + + (dst_vt (node (src0_vt SReg_32:$src0), (src1_vt SReg_32:$src1))), + (inst src0_vt:$src0, src1_vt:$src1) +>; + +class GISelVop2Pat < + SDPatternOperator node, + Instruction inst, + ValueType dst_vt, + ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat < + + (dst_vt (node (src0_vt (sd_vsrc0 src0_vt:$src0)), (src1_vt VGPR_32:$src1))), + (inst src0_vt:$src0, src1_vt:$src1) +>; + +class GISelVop2CommutePat < + SDPatternOperator node, + Instruction inst, + ValueType dst_vt, + ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat < + + (dst_vt (node (src1_vt VGPR_32:$src1), (src0_vt (sd_vsrc0 src0_vt:$src0)))), + (inst src0_vt:$src0, src1_vt:$src1) +>; + +class GISelVop3Pat2 < + SDPatternOperator node, + Instruction inst, + ValueType dst_vt, + ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat < + + (dst_vt (node (src0_vt (sd_vcsrc src0_vt:$src0)), (src1_vt (sd_vcsrc src1_vt:$src1)))), + (inst src0_vt:$src0, src1_vt:$src1) +>; + +class GISelVop3Pat2CommutePat < + SDPatternOperator node, + Instruction inst, + ValueType dst_vt, + ValueType src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat < + + (dst_vt (node (src0_vt (sd_vcsrc src0_vt:$src0)), (src1_vt (sd_vcsrc src1_vt:$src1)))), + (inst src0_vt:$src1, src1_vt:$src0) +>; + +class GISelVop3Pat2ModsPat < + SDPatternOperator node, + Instruction inst, + ValueType dst_vt, + ValueType 
src0_vt = dst_vt, ValueType src1_vt = src0_vt> : GCNPat < + + (dst_vt (node (src0_vt (VOP3Mods0 src0_vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omods)), + (src1_vt (VOP3Mods src1_vt:$src1, i32:$src1_modifiers)))), + (inst i32:$src0_modifiers, src0_vt:$src0, + i32:$src1_modifiers, src1_vt:$src1, $clamp, $omods) +>; + +multiclass GISelVop2IntrPat < + SDPatternOperator node, Instruction inst, + ValueType dst_vt, ValueType src_vt = dst_vt> { + + def : GISelVop2Pat <node, inst, dst_vt, src_vt>; + + // FIXME: Intrinsics aren't marked as commutable, so we need to add an explcit + // pattern to handle commuting. This is another reason why legalizing to a + // generic machine instruction may be better that matching the intrinsic + // directly. + def : GISelVop2CommutePat <node, inst, dst_vt, src_vt>; +} + +def : GISelSop2Pat <or, S_OR_B32, i32>; +def : GISelVop2Pat <or, V_OR_B32_e32, i32>; + +def : GISelSop2Pat <sra, S_ASHR_I32, i32>; +let AddedComplexity = 100 in { +let SubtargetPredicate = isSICI in { +def : GISelVop2Pat <sra, V_ASHR_I32_e32, i32>; +} +def : GISelVop2CommutePat <sra, V_ASHRREV_I32_e32, i32>; +} +def : GISelVop3Pat2CommutePat <sra, V_ASHRREV_I32_e64, i32>; + +// FIXME: Select directly to _e32 so we don't need to deal with modifiers. +// FIXME: We can't re-use SelectionDAG patterns here because they match +// against a custom SDNode and we would need to create a generic machine +// instruction that is equivalent to the custom SDNode. This would also require +// us to custom legalize the intrinsic to the new generic machine instruction, +// but I can't get custom legalizing of intrinsic to work and I'm not sure if +// this is even supported yet. +defm : GISelVop2IntrPat < + int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e32, v2f16, f32>; + +defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>; +def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F64, f64>; +defm : GISelVop2IntrPat <int_minnum, V_MIN_F32_e32, f32>; +def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>; diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def index bf7deb500d1a..3a58c6c6a29f 100644 --- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def +++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -16,41 +16,89 @@ namespace AMDGPU { enum PartialMappingIdx { None = - 1, - PM_SGPR32 = 0, - PM_SGPR64 = 1, - PM_VGPR32 = 2, - PM_VGPR64 = 3 + PM_SGPR1 = 0, + PM_SGPR16 = 4, + PM_SGPR32 = 5, + PM_SGPR64 = 6, + PM_SGPR128 = 7, + PM_SGPR256 = 8, + PM_SGPR512 = 9, + PM_VGPR1 = 10, + PM_VGPR16 = 14, + PM_VGPR32 = 15, + PM_VGPR64 = 16, + PM_VGPR128 = 17, + PM_VGPR256 = 18, + PM_VGPR512 = 19, + PM_SGPR96 = 20, + PM_VGPR96 = 21 }; const RegisterBankInfo::PartialMapping PartMappings[] { // StartIdx, Length, RegBank + {0, 1, SCCRegBank}, + {0, 16, SGPRRegBank}, {0, 32, SGPRRegBank}, {0, 64, SGPRRegBank}, + {0, 128, SGPRRegBank}, + {0, 256, SGPRRegBank}, + {0, 512, SGPRRegBank}, + {0, 1, SGPRRegBank}, + {0, 16, VGPRRegBank}, {0, 32, VGPRRegBank}, - {0, 64, VGPRRegBank} + {0, 64, VGPRRegBank}, + {0, 128, VGPRRegBank}, + {0, 256, VGPRRegBank}, + {0, 512, VGPRRegBank}, + {0, 96, SGPRRegBank}, + {0, 96, VGPRRegBank}, }; const RegisterBankInfo::ValueMapping ValMappings[] { - // SGPR 32-bit {&PartMappings[0], 1}, - // SGPR 64-bit + {nullptr, 0}, + {nullptr, 0}, + {nullptr, 0}, {&PartMappings[1], 1}, - // VGPR 32-bit {&PartMappings[2], 1}, - // VGPR 64-bit - {&PartMappings[3], 1} + {&PartMappings[3], 1}, + {&PartMappings[4], 1}, + {&PartMappings[5], 1}, + {&PartMappings[6], 1}, + 
{&PartMappings[7], 1}, + {nullptr, 0}, + {nullptr, 0}, + {nullptr, 0}, + {&PartMappings[8], 1}, + {&PartMappings[9], 1}, + {&PartMappings[10], 1}, + {&PartMappings[11], 1}, + {&PartMappings[12], 1}, + {&PartMappings[13], 1}, + {&PartMappings[14], 1}, + {&PartMappings[15], 1} }; enum ValueMappingIdx { SGPRStartIdx = 0, - VGPRStartIdx = 2 + VGPRStartIdx = 10 }; const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID, unsigned Size) { - assert(Size % 32 == 0); - unsigned Idx = BankID == AMDGPU::SGPRRegBankID ? SGPRStartIdx : VGPRStartIdx; - Idx += (Size / 32) - 1; + unsigned Idx; + switch (Size) { + case 1: + Idx = BankID == AMDGPU::SCCRegBankID ? PM_SGPR1 : PM_VGPR1; + break; + case 96: + Idx = BankID == AMDGPU::SGPRRegBankID ? PM_SGPR96 : PM_VGPR96; + break; + default: + Idx = BankID == AMDGPU::VGPRRegBankID ? VGPRStartIdx : SGPRStartIdx; + Idx += Log2_32_Ceil(Size); + break; + } return &ValMappings[Idx]; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 463e700f13b7..01ef346f74ee 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -8,13 +8,17 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU HSA Metadata Streamer. +/// AMDGPU HSA Metadata Streamer. /// // //===----------------------------------------------------------------------===// #include "AMDGPUHSAMetadataStreamer.h" #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIMachineFunctionInfo.h" +#include "SIProgramInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Module.h" @@ -196,6 +200,57 @@ std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions( return Dims; } +Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const { + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + HSAMD::Kernel::CodeProps::Metadata HSACodeProps; + const Function &F = MF.getFunction(); + + assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || + F.getCallingConv() == CallingConv::SPIR_KERNEL); + + unsigned MaxKernArgAlign; + HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F, + MaxKernArgAlign); + HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize; + HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize; + HSACodeProps.mKernargSegmentAlign = std::max(MaxKernArgAlign, 4u); + HSACodeProps.mWavefrontSize = STM.getWavefrontSize(); + HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR; + HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR; + HSACodeProps.mMaxFlatWorkGroupSize = MFI.getMaxFlatWorkGroupSize(); + HSACodeProps.mIsDynamicCallStack = ProgramInfo.DynamicCallStack; + HSACodeProps.mIsXNACKEnabled = STM.isXNACKEnabled(); + HSACodeProps.mNumSpilledSGPRs = MFI.getNumSpilledSGPRs(); + HSACodeProps.mNumSpilledVGPRs = MFI.getNumSpilledVGPRs(); + + return HSACodeProps; +} + +Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const { + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); + HSAMD::Kernel::DebugProps::Metadata HSADebugProps; + + if (!STM.debuggerSupported()) + return HSADebugProps; + + HSADebugProps.mDebuggerABIVersion.push_back(1); + 
HSADebugProps.mDebuggerABIVersion.push_back(0); + + if (STM.debuggerEmitPrologue()) { + HSADebugProps.mPrivateSegmentBufferSGPR = + ProgramInfo.DebuggerPrivateSegmentBufferSGPR; + HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR = + ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; + } + + return HSADebugProps; +} + void MetadataStreamer::emitVersion() { auto &Version = HSAMetadata.mVersion; @@ -255,32 +310,7 @@ void MetadataStreamer::emitKernelArgs(const Function &Func) { for (auto &Arg : Func.args()) emitKernelArg(Arg); - // TODO: What about other languages? - if (!Func.getParent()->getNamedMetadata("opencl.ocl.version")) - return; - - auto &DL = Func.getParent()->getDataLayout(); - auto Int64Ty = Type::getInt64Ty(Func.getContext()); - - emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX); - emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY); - emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ); - - auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), - AMDGPUASI.GLOBAL_ADDRESS); - auto CallsPrintf = Func.getParent()->getNamedMetadata("llvm.printf.fmts"); - if (CallsPrintf) - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer); - if (Func.hasFnAttribute("calls-enqueue-kernel")) { - if (!CallsPrintf) { - // Emit a dummy argument so that the remaining hidden arguments - // have a fixed position relative to the first hidden argument. - // This is to facilitate library code to access hidden arguments. - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); - } - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue); - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction); - } + emitHiddenKernelArgs(Func); } void MetadataStreamer::emitKernelArg(const Argument &Arg) { @@ -320,13 +350,26 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) { if (Node && ArgNo < Node->getNumOperands()) TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); - emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(), - getValueKind(Arg.getType(), TypeQual, BaseTypeName), Name, - TypeName, BaseTypeName, AccQual, TypeQual); + Type *Ty = Arg.getType(); + const DataLayout &DL = Func->getParent()->getDataLayout(); + + unsigned PointeeAlign = 0; + if (auto PtrTy = dyn_cast<PointerType>(Ty)) { + if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) { + PointeeAlign = Arg.getParamAlignment(); + if (PointeeAlign == 0) + PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType()); + } + } + + emitKernelArg(DL, Ty, getValueKind(Arg.getType(), TypeQual, BaseTypeName), + PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual); } void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, - ValueKind ValueKind, StringRef Name, + ValueKind ValueKind, + unsigned PointeeAlign, + StringRef Name, StringRef TypeName, StringRef BaseTypeName, StringRef AccQual, StringRef TypeQual) { HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata()); @@ -338,12 +381,7 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, Arg.mAlign = DL.getABITypeAlignment(Ty); Arg.mValueKind = ValueKind; Arg.mValueType = getValueType(Ty, BaseTypeName); - - if (auto PtrTy = dyn_cast<PointerType>(Ty)) { - auto ElTy = PtrTy->getElementType(); - if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS && ElTy->isSized()) - Arg.mPointeeAlign = DL.getABITypeAlignment(ElTy); - } + Arg.mPointeeAlign = PointeeAlign; if (auto PtrTy = dyn_cast<PointerType>(Ty)) Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace()); @@ -366,6 
+404,48 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, } } +void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) { + int HiddenArgNumBytes = + getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0); + + if (!HiddenArgNumBytes) + return; + + auto &DL = Func.getParent()->getDataLayout(); + auto Int64Ty = Type::getInt64Ty(Func.getContext()); + + if (HiddenArgNumBytes >= 8) + emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX); + if (HiddenArgNumBytes >= 16) + emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY); + if (HiddenArgNumBytes >= 24) + emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ); + + auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), + AMDGPUASI.GLOBAL_ADDRESS); + + // Emit "printf buffer" argument if printf is used, otherwise emit dummy + // "none" argument. + if (HiddenArgNumBytes >= 32) { + if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer); + else + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); + } + + // Emit "default queue" and "completion action" arguments if enqueue kernel is + // used, otherwise emit dummy "none" arguments. + if (HiddenArgNumBytes >= 48) { + if (Func.hasFnAttribute("calls-enqueue-kernel")) { + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue); + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction); + } else { + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); + } + } +} + void MetadataStreamer::begin(const Module &Mod) { AMDGPUASI = getAMDGPUAS(Mod); emitVersion(); @@ -383,13 +463,14 @@ void MetadataStreamer::end() { verify(HSAMetadataString); } -void MetadataStreamer::emitKernel( - const Function &Func, - const Kernel::CodeProps::Metadata &CodeProps, - const Kernel::DebugProps::Metadata &DebugProps) { +void MetadataStreamer::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) { + auto &Func = MF.getFunction(); if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL) return; + auto CodeProps = getHSACodeProps(MF, ProgramInfo); + auto DebugProps = getHSADebugProps(MF, ProgramInfo); + HSAMetadata.mKernels.push_back(Kernel::Metadata()); auto &Kernel = HSAMetadata.mKernels.back(); diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index bd6515521a74..3424c956d781 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU HSA Metadata Streamer. +/// AMDGPU HSA Metadata Streamer. 
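Back in emitHiddenKernelArgs above, which hidden arguments get metadata is driven entirely by the amdgpu-implicitarg-num-bytes attribute plus whether the module uses printf or device enqueue. A small standalone sketch of that mapping (the strings are informal labels for the ValueKind entries; the two flags model the llvm.printf.fmts and calls-enqueue-kernel checks):

  #include <cstdio>
  #include <string>
  #include <vector>

  // Hidden kernel arguments emitted for a given implicit-arg byte budget,
  // mirroring the thresholds in emitHiddenKernelArgs.
  static std::vector<std::string> hiddenArgs(int NumBytes, bool UsesPrintf,
                                             bool EnqueuesKernels) {
    std::vector<std::string> Out;
    if (NumBytes >= 8)  Out.push_back("global_offset_x"); // i64
    if (NumBytes >= 16) Out.push_back("global_offset_y"); // i64
    if (NumBytes >= 24) Out.push_back("global_offset_z"); // i64
    if (NumBytes >= 32) // printf buffer, or a placeholder to keep positions fixed
      Out.push_back(UsesPrintf ? "printf_buffer" : "none");
    if (NumBytes >= 48) {
      if (EnqueuesKernels) {
        Out.push_back("default_queue");
        Out.push_back("completion_action");
      } else {
        Out.push_back("none");
        Out.push_back("none");
      }
    }
    return Out;
  }

  int main() {
    for (const auto &A : hiddenArgs(48, /*UsesPrintf=*/true, /*EnqueuesKernels=*/false))
      std::printf("%s\n", A.c_str());
    return 0;
  }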
/// // //===----------------------------------------------------------------------===// @@ -28,6 +28,7 @@ class DataLayout; class Function; class MDNode; class Module; +struct SIProgramInfo; class Type; namespace AMDGPU { @@ -55,6 +56,13 @@ private: std::vector<uint32_t> getWorkGroupDimensions(MDNode *Node) const; + Kernel::CodeProps::Metadata getHSACodeProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const; + Kernel::DebugProps::Metadata getHSADebugProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const; + void emitVersion(); void emitPrintf(const Module &Mod); @@ -68,10 +76,13 @@ private: void emitKernelArg(const Argument &Arg); void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind, + unsigned PointeeAlign = 0, StringRef Name = "", StringRef TypeName = "", StringRef BaseTypeName = "", StringRef AccQual = "", StringRef TypeQual = ""); + void emitHiddenKernelArgs(const Function &Func); + public: MetadataStreamer() = default; ~MetadataStreamer() = default; @@ -84,9 +95,7 @@ public: void end(); - void emitKernel(const Function &Func, - const Kernel::CodeProps::Metadata &CodeProps, - const Kernel::DebugProps::Metadata &DebugProps); + void emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo); }; } // end namespace HSAMD diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index f4776adb069c..f25f4d4693ea 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -8,7 +8,7 @@ //==-----------------------------------------------------------------------===// // /// \file -/// \brief Defines an instruction selector for the AMDGPU target. +/// Defines an instruction selector for the AMDGPU target. // //===----------------------------------------------------------------------===// @@ -16,6 +16,7 @@ #include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPUInstrInfo.h" +#include "AMDGPUPerfHintAnalysis.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" @@ -24,15 +25,16 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -43,6 +45,7 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include <cassert> #include <cstdint> @@ -68,7 +71,7 @@ namespace { class AMDGPUDAGToDAGISel : public SelectionDAGISel { // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can // make the right decision when generating code for different targets. 
- const AMDGPUSubtarget *Subtarget; + const GCNSubtarget *Subtarget; AMDGPUAS AMDGPUASI; bool EnableLateStructurizeCFG; @@ -83,6 +86,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AMDGPUArgumentUsageInfo>(); + AU.addRequired<AMDGPUPerfHintAnalysis>(); + AU.addRequired<DivergenceAnalysis>(); SelectionDAGISel::getAnalysisUsage(AU); } @@ -98,20 +103,12 @@ private: std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const; bool isNoNanSrc(SDValue N) const; bool isInlineImmediate(const SDNode *N) const; - bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs, - const R600InstrInfo *TII); - bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &); - bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &); - bool isConstantLoad(const MemSDNode *N, int cbID) const; bool isUniformBr(const SDNode *N) const; SDNode *glueCopyToM0(SDNode *N) const; const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; - bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); - bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, - SDValue& Offset); virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, @@ -162,6 +159,7 @@ private: bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; + SDValue Expand32BitAddress(SDValue Addr) const; bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool &Imm) const; bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; @@ -216,7 +214,7 @@ private: void SelectS_BFE(SDNode *N); bool isCBranchSCC(const SDNode *N) const; void SelectBRCOND(SDNode *N); - void SelectFMAD(SDNode *N); + void SelectFMAD_FMA(SDNode *N); void SelectATOMIC_CMP_SWAP(SDNode *N); protected: @@ -225,9 +223,18 @@ protected: }; class R600DAGToDAGISel : public AMDGPUDAGToDAGISel { + const R600Subtarget *Subtarget; + AMDGPUAS AMDGPUASI; + + bool isConstantLoad(const MemSDNode *N, int cbID) const; + bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); + bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, + SDValue& Offset); public: explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) : - AMDGPUDAGToDAGISel(TM, OptLevel) {} + AMDGPUDAGToDAGISel(TM, OptLevel) { + AMDGPUASI = AMDGPU::getAMDGPUAS(*TM); + } void Select(SDNode *N) override; @@ -235,6 +242,11 @@ public: SDValue &Offset) override; bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset) override; + + bool runOnMachineFunction(MachineFunction &MF) override; +protected: + // Include the pieces autogenerated from the target description. +#include "R600GenDAGISel.inc" }; } // end anonymous namespace @@ -242,17 +254,19 @@ public: INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "isel", "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo) +INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel", "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) -/// \brief This pass converts a legalized DAG into a AMDGPU-specific +/// This pass converts a legalized DAG into a AMDGPU-specific // DAG, ready for instruction scheduling. 
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel) { return new AMDGPUDAGToDAGISel(TM, OptLevel); } -/// \brief This pass converts a legalized DAG into a R600-specific +/// This pass converts a legalized DAG into a R600-specific // DAG, ready for instruction scheduling. FunctionPass *llvm::createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel) { @@ -260,7 +274,7 @@ FunctionPass *llvm::createR600ISelDag(TargetMachine *TM, } bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { - Subtarget = &MF.getSubtarget<AMDGPUSubtarget>(); + Subtarget = &MF.getSubtarget<GCNSubtarget>(); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -276,8 +290,7 @@ bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const { } bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const { - const SIInstrInfo *TII - = static_cast<const SISubtarget *>(Subtarget)->getInstrInfo(); + const SIInstrInfo *TII = Subtarget->getInstrInfo(); if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) return TII->isInlineConstant(C->getAPIntValue()); @@ -288,7 +301,7 @@ bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const { return false; } -/// \brief Determine the register class for \p OpNo +/// Determine the register class for \p OpNo /// \returns The register class of the virtual register that will be used for /// the given operand number \OpNo or NULL if the register class cannot be /// determined. @@ -303,7 +316,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, } const SIRegisterInfo *TRI - = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo(); + = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo(); return TRI->getPhysRegClass(Reg); } @@ -394,7 +407,6 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); - const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); SDLoc DL(N); SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); @@ -420,10 +432,9 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { IsRegSeq = false; break; } + unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i); RegSeqArgs[1 + (2 * i)] = N->getOperand(i); - RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, - MVT::i32); + RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32); } if (NOps != NumVectorElts) { // Fill in the missing undef elements if this was a scalar_to_vector. 
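The SelectBuildVector changes above (continued in the next hunk) assemble the operand list of a REG_SEQUENCE node: operand 0 is the register class, followed by one (value, sub-register index) pair per vector element, with undefined tail elements filled by IMPLICIT_DEF. A standalone sketch of that layout, not part of the patch; the register class name and the single-defined-element case are made-up example values:

#include <cstdio>
#include <string>
#include <vector>

int main() {
  const unsigned NumVectorElts = 4; // e.g. a v4i32 BUILD_VECTOR
  const unsigned NumDefinedOps = 1; // scalar_to_vector: only element 0 is real

  // Operand 0 is the register class; then one (value, sub-register) pair per
  // element. Undefined tail elements are filled with IMPLICIT_DEF.
  std::vector<std::string> RegSeqArgs(1 + NumVectorElts * 2);
  RegSeqArgs[0] = "SReg_128RegClassID"; // hypothetical class for this example
  for (unsigned i = 0; i < NumVectorElts; ++i) {
    RegSeqArgs[1 + 2 * i] =
        i < NumDefinedOps ? "op" + std::to_string(i) : "IMPLICIT_DEF";
    // getSubRegFromChannel(i) supplies the sub-register index for lane i.
    RegSeqArgs[1 + 2 * i + 1] = "sub" + std::to_string(i);
  }

  for (const std::string &Arg : RegSeqArgs)
    std::printf("%s\n", Arg.c_str());
  return 0;
}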
@@ -431,9 +442,10 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, EltVT); for (unsigned i = NOps; i < NumVectorElts; ++i) { + unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i); RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32); + CurDAG->getTargetConstant(Sub, DL, MVT::i32); } } @@ -450,7 +462,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { } if (isa<AtomicSDNode>(N) || - (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC)) + (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC || + Opc == AMDGPUISD::ATOMIC_LOAD_FADD || + Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || + Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) N = glueCopyToM0(N); switch (Opc) { @@ -487,9 +502,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::BUILD_VECTOR: { EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); - - if (VT == MVT::v2i16 || VT == MVT::v2f16) { - if (Opc == ISD::BUILD_VECTOR) { + if (VT.getScalarSizeInBits() == 16) { + if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) { uint32_t LHSVal, RHSVal; if (getConstantValue(N->getOperand(0), LHSVal) && getConstantValue(N->getOperand(1), RHSVal)) { @@ -559,7 +573,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { return; } case ISD::LOAD: - case ISD::STORE: { + case ISD::STORE: + case ISD::ATOMIC_LOAD: + case ISD::ATOMIC_STORE: { N = glueCopyToM0(N); break; } @@ -619,7 +635,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectBRCOND(N); return; case ISD::FMAD: - SelectFMAD(N); + case ISD::FMA: + SelectFMAD_FMA(N); return; case AMDGPUISD::ATOMIC_CMP_SWAP: SelectATOMIC_CMP_SWAP(N); @@ -629,15 +646,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectCode(N); } -bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const { - if (!N->readMem()) - return false; - if (CbId == -1) - return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS; - - return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId; -} - bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const { const BasicBlock *BB = FuncInfo->MBB->getBasicBlock(); const Instruction *Term = BB->getTerminator(); @@ -653,26 +661,6 @@ StringRef AMDGPUDAGToDAGISel::getPassName() const { // Complex Patterns //===----------------------------------------------------------------------===// -bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, - SDValue& IntPtr) { - if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) { - IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), - true); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, - SDValue& BaseReg, SDValue &Offset) { - if (!isa<ConstantSDNode>(Addr)) { - BaseReg = Addr; - Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); - return true; - } - return false; -} - bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset) { return false; @@ -684,11 +672,11 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, SDLoc DL(Addr); if ((C = dyn_cast<ConstantSDNode>(Addr))) { - Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && (C 
= dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) { - Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { @@ -759,12 +747,11 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { if (ProduceCarry) { // Replace the carry-use - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(AddHi, 1)); + ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1)); } // Replace the remaining uses. - CurDAG->ReplaceAllUsesWith(N, RegSequence); - CurDAG->RemoveDeadNode(N); + ReplaceNode(N, RegSequence); } void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { @@ -1410,7 +1397,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, return false; SDLoc SL(ByteOffsetNode); - AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration(); + GCNSubtarget::Generation Gen = Subtarget->getGeneration(); int64_t ByteOffset = C->getSExtValue(); int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset); @@ -1435,19 +1422,45 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, return true; } +SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { + if (Addr.getValueType() != MVT::i32) + return Addr; + + // Zero-extend a 32-bit address. + SDLoc SL(Addr); + + const MachineFunction &MF = CurDAG->getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + unsigned AddrHiVal = Info->get32BitAddressHighBits(); + SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32); + + const SDValue Ops[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32), + Addr, + CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32), + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi), + 0), + CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32), + }; + + return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64, + Ops), 0); +} + bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool &Imm) const { SDLoc SL(Addr); + if (CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); if (SelectSMRDOffset(N1, Offset, Imm)) { - SBase = N0; + SBase = Expand32BitAddress(N0); return true; } } - SBase = Addr; + SBase = Expand32BitAddress(Addr); Offset = CurDAG->getTargetConstant(0, SL, MVT::i32); Imm = true; return true; @@ -1651,7 +1664,7 @@ bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const { return true; if (VT == MVT::i64) { - auto ST = static_cast<const SISubtarget *>(Subtarget); + auto ST = static_cast<const GCNSubtarget *>(Subtarget); ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64(); @@ -1674,15 +1687,39 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { unsigned CondReg = UseSCCBr ? AMDGPU::SCC : AMDGPU::VCC; SDLoc SL(N); + if (!UseSCCBr) { + // This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not + // analyzed what generates the vcc value, so we do not know whether vcc + // bits for disabled lanes are 0. Thus we need to mask out bits for + // disabled lanes. 
+ // + // For the case that we select S_CBRANCH_SCC1 and it gets + // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls + // SIInstrInfo::moveToVALU which inserts the S_AND). + // + // We could add an analysis of what generates the vcc value here and omit + // the S_AND when is unnecessary. But it would be better to add a separate + // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it + // catches both cases. + Cond = SDValue(CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1, + CurDAG->getRegister(AMDGPU::EXEC, MVT::i1), + Cond), + 0); + } + SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond); CurDAG->SelectNodeTo(N, BrOp, MVT::Other, N->getOperand(2), // Basic Block VCC.getValue(0)); } -void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) { +void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) { MVT VT = N->getSimpleValueType(0); - if (VT != MVT::f32 || !Subtarget->hasMadMixInsts()) { + bool IsFMA = N->getOpcode() == ISD::FMA; + if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() && + !Subtarget->hasFmaMixInsts()) || + ((IsFMA && Subtarget->hasMadMixInsts()) || + (!IsFMA && Subtarget->hasFmaMixInsts()))) { SelectCode(N); return; } @@ -1692,13 +1729,13 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) { SDValue Src2 = N->getOperand(2); unsigned Src0Mods, Src1Mods, Src2Mods; - // Avoid using v_mad_mix_f32 unless there is actually an operand using the - // conversion from f16. + // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand + // using the conversion from f16. bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods); bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods); bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods); - assert(!Subtarget->hasFP32Denormals() && + assert((IsFMA || !Subtarget->hasFP32Denormals()) && "fmad selected with denormals enabled"); // TODO: We can select this with f32 denormals enabled if all the sources are // converted from f16 (in which case fmad isn't legal). @@ -1714,7 +1751,9 @@ void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) { Zero, Zero }; - CurDAG->SelectNodeTo(N, AMDGPU::V_MAD_MIX_F32, MVT::f32, Ops); + CurDAG->SelectNodeTo(N, + IsFMA ? 
AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32, + MVT::f32, Ops); } else { SelectCode(N); } @@ -2100,6 +2139,41 @@ void AMDGPUDAGToDAGISel::PostprocessISelDAG() { } while (IsModified); } +bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { + Subtarget = &MF.getSubtarget<R600Subtarget>(); + return SelectionDAGISel::runOnMachineFunction(MF); +} + +bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const { + if (!N->readMem()) + return false; + if (CbId == -1) + return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT; + + return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId; +} + +bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, + SDValue& IntPtr) { + if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) { + IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), + true); + return true; + } + return false; +} + +bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, + SDValue& BaseReg, SDValue &Offset) { + if (!isa<ConstantSDNode>(Addr)) { + BaseReg = Addr; + Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); + return true; + } + return false; +} + void R600DAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { @@ -2120,12 +2194,12 @@ void R600DAGToDAGISel::Select(SDNode *N) { // pass. We want to avoid 128 bits copies as much as possible because they // can't be bundled by our scheduler. switch(NumVectorElts) { - case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; + case 2: RegClassID = R600::R600_Reg64RegClassID; break; case 4: if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) - RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; + RegClassID = R600::R600_Reg128VerticalRegClassID; else - RegClassID = AMDGPU::R600_Reg128RegClassID; + RegClassID = R600::R600_Reg128RegClassID; break; default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); } @@ -2143,11 +2217,11 @@ bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, SDLoc DL(Addr); if ((C = dyn_cast<ConstantSDNode>(Addr))) { - Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) { - Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { @@ -2178,7 +2252,7 @@ bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, && isInt<16>(IMMOffset->getZExtValue())) { Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(CurDAG->getEntryNode()), - AMDGPU::ZERO, MVT::i32); + R600::ZERO, MVT::i32); Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), MVT::i32); return true; diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 49929441ef21..b201126c593b 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This is the parent 
TargetLowering class for hardware code gen +/// This is the parent TargetLowering class for hardware code gen /// targets. // //===----------------------------------------------------------------------===// @@ -25,9 +25,12 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" +#include "Utils/AMDGPUBaseInfo.h" #include "R600MachineFunctionInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -38,18 +41,6 @@ #include "llvm/Support/KnownBits.h" using namespace llvm; -static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - MachineFunction &MF = State.getMachineFunction(); - AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); - - uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(), - ArgFlags.getOrigAlign()); - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo)); - return true; -} - static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, @@ -71,7 +62,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, case MVT::i64: case MVT::f64: case MVT::v2i32: - case MVT::v2f32: { + case MVT::v2f32: + case MVT::v4i16: + case MVT::v4f16: { // Up to SGPR0-SGPR39 return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, &AMDGPU::SGPR_64RegClass, 20); @@ -92,7 +85,9 @@ static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT, case MVT::i64: case MVT::f64: case MVT::v2i32: - case MVT::v2f32: { + case MVT::v2f32: + case MVT::v4i16: + case MVT::v4f16: { return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, &AMDGPU::VReg_64RegClass, 31); } @@ -324,10 +319,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FLOG, MVT::f32, Custom); setOperationAction(ISD::FLOG10, MVT::f32, Custom); - if (Subtarget->has16BitInsts()) { - setOperationAction(ISD::FLOG, MVT::f16, Custom); - setOperationAction(ISD::FLOG10, MVT::f16, Custom); - } setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); @@ -335,10 +326,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FREM, MVT::f32, Custom); setOperationAction(ISD::FREM, MVT::f64, Custom); - // v_mad_f32 does not support denormals according to some sources. - if (!Subtarget->hasFP32Denormals()) - setOperationAction(ISD::FMAD, MVT::f32, Legal); - // Expand to fneg + fadd. setOperationAction(ISD::FSUB, MVT::f64, Expand); @@ -353,19 +340,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); - if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { - setOperationAction(ISD::FCEIL, MVT::f64, Custom); - setOperationAction(ISD::FTRUNC, MVT::f64, Custom); - setOperationAction(ISD::FRINT, MVT::f64, Custom); - setOperationAction(ISD::FFLOOR, MVT::f64, Custom); - } - - if (!Subtarget->hasBFI()) { - // fcopysign can be done in a single instruction with BFI. 
- setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); - } - setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom); @@ -389,13 +363,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); - } - - if (!Subtarget->hasBCNT(32)) - setOperationAction(ISD::CTPOP, MVT::i32, Expand); - if (!Subtarget->hasBCNT(64)) - setOperationAction(ISD::CTPOP, MVT::i64, Expand); + // AMDGPU uses ADDC/SUBC/ADDE/SUBE + setOperationAction(ISD::ADDC, VT, Legal); + setOperationAction(ISD::SUBC, VT, Legal); + setOperationAction(ISD::ADDE, VT, Legal); + setOperationAction(ISD::SUBE, VT, Legal); + } // The hardware supports 32-bit ROTR, but not ROTL. setOperationAction(ISD::ROTL, MVT::i32, Expand); @@ -416,28 +390,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMAX, MVT::i32, Legal); setOperationAction(ISD::UMAX, MVT::i32, Legal); - if (Subtarget->hasFFBH()) - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); - - if (Subtarget->hasFFBL()) - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); - setOperationAction(ISD::CTTZ, MVT::i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom); setOperationAction(ISD::CTLZ, MVT::i64, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); - // We only really have 32-bit BFE instructions (and 16-bit on VI). - // - // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any - // effort to match them now. We want this to be false for i64 cases when the - // extraction isn't restricted to the upper or lower half. Ideally we would - // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that - // span the midpoint are probably relatively rare, so don't worry about them - // for now. - if (Subtarget->hasBFE()) - setHasExtractBitsInsn(true); - static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v4i32 }; @@ -468,10 +425,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Custom); setOperationAction(ISD::UDIVREM, VT, Expand); - setOperationAction(ISD::ADDC, VT, Expand); - setOperationAction(ISD::SUBC, VT, Expand); - setOperationAction(ISD::ADDE, VT, Expand); - setOperationAction(ISD::SUBE, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); @@ -546,11 +499,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, // vector compares until that is fixed. setHasMultipleConditionRegisters(true); - // SI at least has hardware support for floating point exceptions, but no way - // of using or handling them is implemented. 
They are also optional in OpenCL - // (Section 7.3) - setHasFloatingPointExceptions(Subtarget->hasFPExceptions()); - PredictableSelectIsExpensive = false; // We want to find all load dependencies for long chains of stores to enable @@ -573,6 +521,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::MULHU); setTargetDAGCombine(ISD::MULHS); @@ -607,6 +556,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) { case ISD::FNEARBYINT: case AMDGPUISD::RCP: case AMDGPUISD::RCP_LEGACY: + case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::SIN_HW: case AMDGPUISD::FMUL_LEGACY: case AMDGPUISD::FMIN_LEGACY: @@ -748,6 +698,37 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const { return true; } +bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const { + switch (N->getOpcode()) { + default: + return false; + case ISD::EntryToken: + case ISD::TokenFactor: + return true; + case ISD::INTRINSIC_WO_CHAIN: + { + unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + switch (IntrID) { + default: + return false; + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: + return true; + } + } + break; + case ISD::LOAD: + { + const LoadSDNode * L = dyn_cast<LoadSDNode>(N); + if (L->getMemOperand()->getAddrSpace() + == AMDGPUASI.CONSTANT_ADDRESS_32BIT) + return true; + return false; + } + break; + } +} + //===---------------------------------------------------------------------===// // Target Properties //===---------------------------------------------------------------------===// @@ -832,17 +813,6 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return isZExtFree(Val.getValueType(), VT2); } -// v_mad_mix* support a conversion from f16 to f32. -// -// There is only one special case when denormals are enabled we don't currently, -// where this is OK to use. -bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode, - EVT DestVT, EVT SrcVT) const { - return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() && - DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() && - SrcVT.getScalarType() == MVT::f16; -} - bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { // There aren't really 64-bit registers, but pairs of 32-bit ones and only a // limited number of native 64-bit operations. Shrinking an operation to fit @@ -862,7 +832,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, switch (CC) { case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: - return CC_AMDGPU_Kernel; + llvm_unreachable("kernels should not be handled here"); case CallingConv::AMDGPU_VS: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: @@ -885,7 +855,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, switch (CC) { case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: - return CC_AMDGPU_Kernel; + llvm_unreachable("kernels should not be handled here"); case CallingConv::AMDGPU_VS: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: @@ -929,74 +899,118 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, /// for each individual part is i8. We pass the memory type as LocVT to the /// calling convention analysis function and the register type (Ins[x].VT) as /// the ValVT. 
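The rewritten analyzeFormalArgumentsCompute in the next hunk derives each argument's in-memory offset with an align-then-advance pattern: align the running explicit-argument offset to the argument's ABI alignment, add the target's explicit base offset, then advance by the allocation size. A minimal standalone sketch of that arithmetic, not part of the patch; the argument list and the 36-byte base offset are made-up example values:

#include <cstdint>
#include <cstdio>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  struct Arg { const char *Name; uint64_t Size, Align; };
  // Hypothetical kernel signature: (char, int4, global float*).
  const Arg Args[] = {{"char", 1, 1}, {"int4", 16, 16}, {"float*", 8, 8}};

  const uint64_t ExplicitOffset = 36; // example base offset, target-dependent
  uint64_t ExplicitArgOffset = 0;

  for (const Arg &A : Args) {
    uint64_t ArgOffset = alignTo(ExplicitArgOffset, A.Align) + ExplicitOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, A.Align) + A.Size;
    std::printf("%-7s -> offset %llu\n", A.Name,
                (unsigned long long)ArgOffset);
  }
  return 0;
}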
-void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State, - const SmallVectorImpl<ISD::InputArg> &Ins) const { - for (unsigned i = 0, e = Ins.size(); i != e; ++i) { - const ISD::InputArg &In = Ins[i]; - EVT MemVT; - - unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT); - - if (!Subtarget->isAmdHsaOS() && - (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) { - // The ABI says the caller will extend these values to 32-bits. - MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32; - } else if (NumRegs == 1) { - // This argument is not split, so the IR type is the memory type. - assert(!In.Flags.isSplit()); - if (In.ArgVT.isExtended()) { - // We have an extended type, like i24, so we should just use the register type - MemVT = In.VT; - } else { - MemVT = In.ArgVT; - } - } else if (In.ArgVT.isVector() && In.VT.isVector() && - In.ArgVT.getScalarType() == In.VT.getScalarType()) { - assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements()); - // We have a vector value which has been split into a vector with - // the same scalar type, but fewer elements. This should handle - // all the floating-point vector types. - MemVT = In.VT; - } else if (In.ArgVT.isVector() && - In.ArgVT.getVectorNumElements() == NumRegs) { - // This arg has been split so that each element is stored in a separate - // register. - MemVT = In.ArgVT.getScalarType(); - } else if (In.ArgVT.isExtended()) { - // We have an extended type, like i65. - MemVT = In.VT; - } else { - unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs; - assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0); - if (In.VT.isInteger()) { - MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits); - } else if (In.VT.isVector()) { - assert(!In.VT.getScalarType().isFloatingPoint()); - unsigned NumElements = In.VT.getVectorNumElements(); - assert(MemoryBits % NumElements == 0); - // This vector type has been split into another vector type with - // a different elements size. - EVT ScalarVT = EVT::getIntegerVT(State.getContext(), - MemoryBits / NumElements); - MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements); +void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( + CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const { + const MachineFunction &MF = State.getMachineFunction(); + const Function &Fn = MF.getFunction(); + LLVMContext &Ctx = Fn.getParent()->getContext(); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); + const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn); + + unsigned MaxAlign = 1; + uint64_t ExplicitArgOffset = 0; + const DataLayout &DL = Fn.getParent()->getDataLayout(); + + unsigned InIndex = 0; + + for (const Argument &Arg : Fn.args()) { + Type *BaseArgTy = Arg.getType(); + unsigned Align = DL.getABITypeAlignment(BaseArgTy); + MaxAlign = std::max(Align, MaxAlign); + unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy); + + uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset; + ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize; + + // We're basically throwing away everything passed into us and starting over + // to get accurate in-memory offsets. The "PartOffset" is completely useless + // to us as computed in Ins. + // + // We also need to figure out what type legalization is trying to do to get + // the correct memory offsets. 
+ + SmallVector<EVT, 16> ValueVTs; + SmallVector<uint64_t, 16> Offsets; + ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset); + + for (unsigned Value = 0, NumValues = ValueVTs.size(); + Value != NumValues; ++Value) { + uint64_t BasePartOffset = Offsets[Value]; + + EVT ArgVT = ValueVTs[Value]; + EVT MemVT = ArgVT; + MVT RegisterVT = + getRegisterTypeForCallingConv(Ctx, ArgVT); + unsigned NumRegs = + getNumRegistersForCallingConv(Ctx, ArgVT); + + if (!Subtarget->isAmdHsaOS() && + (ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) { + // The ABI says the caller will extend these values to 32-bits. + MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32; + } else if (NumRegs == 1) { + // This argument is not split, so the IR type is the memory type. + if (ArgVT.isExtended()) { + // We have an extended type, like i24, so we should just use the + // register type. + MemVT = RegisterVT; + } else { + MemVT = ArgVT; + } + } else if (ArgVT.isVector() && RegisterVT.isVector() && + ArgVT.getScalarType() == RegisterVT.getScalarType()) { + assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements()); + // We have a vector value which has been split into a vector with + // the same scalar type, but fewer elements. This should handle + // all the floating-point vector types. + MemVT = RegisterVT; + } else if (ArgVT.isVector() && + ArgVT.getVectorNumElements() == NumRegs) { + // This arg has been split so that each element is stored in a separate + // register. + MemVT = ArgVT.getScalarType(); + } else if (ArgVT.isExtended()) { + // We have an extended type, like i65. + MemVT = RegisterVT; } else { - llvm_unreachable("cannot deduce memory type."); + unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs; + assert(ArgVT.getStoreSizeInBits() % NumRegs == 0); + if (RegisterVT.isInteger()) { + MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits); + } else if (RegisterVT.isVector()) { + assert(!RegisterVT.getScalarType().isFloatingPoint()); + unsigned NumElements = RegisterVT.getVectorNumElements(); + assert(MemoryBits % NumElements == 0); + // This vector type has been split into another vector type with + // a different elements size. + EVT ScalarVT = EVT::getIntegerVT(State.getContext(), + MemoryBits / NumElements); + MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements); + } else { + llvm_unreachable("cannot deduce memory type."); + } } - } - // Convert one element vectors to scalar. - if (MemVT.isVector() && MemVT.getVectorNumElements() == 1) - MemVT = MemVT.getScalarType(); + // Convert one element vectors to scalar. 
+ if (MemVT.isVector() && MemVT.getVectorNumElements() == 1) + MemVT = MemVT.getScalarType(); - if (MemVT.isExtended()) { - // This should really only happen if we have vec3 arguments - assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3); - MemVT = MemVT.getPow2VectorType(State.getContext()); - } + if (MemVT.isExtended()) { + // This should really only happen if we have vec3 arguments + assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3); + MemVT = MemVT.getPow2VectorType(State.getContext()); + } - assert(MemVT.isSimple()); - allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags, - State); + unsigned PartOffset = 0; + for (unsigned i = 0; i != NumRegs; ++i) { + State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT, + BasePartOffset + PartOffset, + MemVT.getSimpleVT(), + CCValAssign::Full)); + PartOffset += MemVT.getStoreSize(); + } + } } } @@ -1178,7 +1192,15 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = G->getGlobal(); - if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) { + if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS || + G->getAddressSpace() == AMDGPUASI.REGION_ADDRESS) { + if (!MFI->isEntryFunction()) { + const Function &Fn = DAG.getMachineFunction().getFunction(); + DiagnosticInfoUnsupported BadLDSDecl( + Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc()); + DAG.getContext()->diagnose(BadLDSDecl); + } + // XXX: What does the value of G->getOffset() mean? assert(G->getOffset() == 0 && "Do not know what to do with an non-zero offset"); @@ -1201,6 +1223,16 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { SmallVector<SDValue, 8> Args; + EVT VT = Op.getValueType(); + if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SDLoc SL(Op); + SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0)); + SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1)); + + SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi }); + return DAG.getNode(ISD::BITCAST, SL, VT, BV); + } + for (const SDUse &U : Op->ops()) DAG.ExtractVectorElements(U.get(), Args); @@ -1219,7 +1251,7 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args); } -/// \brief Generate Min/Max node +/// Generate Min/Max node SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, @@ -1985,7 +2017,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32); SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); - // Extend back to to 64-bits. + // Extend back to 64-bits. 
SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit}); SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); @@ -2806,28 +2838,6 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, SN->getBasePtr(), SN->getMemOperand()); } -SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); - if (!CSrc) - return SDValue(); - - const APFloat &F = CSrc->getValueAPF(); - APFloat Zero = APFloat::getZero(F.getSemantics()); - APFloat::cmpResult Cmp0 = F.compare(Zero); - if (Cmp0 == APFloat::cmpLessThan || - (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) { - return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); - } - - APFloat One(F.getSemantics(), "1.0"); - APFloat::cmpResult Cmp1 = F.compare(One); - if (Cmp1 == APFloat::cmpGreaterThan) - return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); - - return SDValue(CSrc, 0); -} - // FIXME: This should go in generic DAG combiner with an isTruncateFree check, // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU // issues. @@ -2903,7 +2913,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, SDValue X = LHS->getOperand(0); if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 && - isTypeLegal(MVT::v2i16)) { + isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) { // Prefer build_vector as the canonical form if packed types are legal. // (shl ([asz]ext i16:x), 16 -> build_vector 0, x SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL, @@ -3017,6 +3027,92 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair); } +SDValue AMDGPUTargetLowering::performTruncateCombine( + SDNode *N, DAGCombinerInfo &DCI) const { + SDLoc SL(N); + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + SDValue Src = N->getOperand(0); + + // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x) + if (Src.getOpcode() == ISD::BITCAST) { + SDValue Vec = Src.getOperand(0); + if (Vec.getOpcode() == ISD::BUILD_VECTOR) { + SDValue Elt0 = Vec.getOperand(0); + EVT EltVT = Elt0.getValueType(); + if (VT.getSizeInBits() <= EltVT.getSizeInBits()) { + if (EltVT.isFloatingPoint()) { + Elt0 = DAG.getNode(ISD::BITCAST, SL, + EltVT.changeTypeToInteger(), Elt0); + } + + return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0); + } + } + } + + // Equivalent of above for accessing the high element of a vector as an + // integer operation. + // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y) + if (Src.getOpcode() == ISD::SRL && !VT.isVector()) { + if (auto K = isConstOrConstSplat(Src.getOperand(1))) { + if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) { + SDValue BV = stripBitcast(Src.getOperand(0)); + if (BV.getOpcode() == ISD::BUILD_VECTOR && + BV.getValueType().getVectorNumElements() == 2) { + SDValue SrcElt = BV.getOperand(1); + EVT SrcEltVT = SrcElt.getValueType(); + if (SrcEltVT.isFloatingPoint()) { + SrcElt = DAG.getNode(ISD::BITCAST, SL, + SrcEltVT.changeTypeToInteger(), SrcElt); + } + + return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt); + } + } + } + } + + // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit. 
+ // + // i16 (trunc (srl i64:x, K)), K <= 16 -> + // i16 (trunc (srl (i32 (trunc x), K))) + if (VT.getScalarSizeInBits() < 32) { + EVT SrcVT = Src.getValueType(); + if (SrcVT.getScalarSizeInBits() > 32 && + (Src.getOpcode() == ISD::SRL || + Src.getOpcode() == ISD::SRA || + Src.getOpcode() == ISD::SHL)) { + SDValue Amt = Src.getOperand(1); + KnownBits Known; + DAG.computeKnownBits(Amt, Known); + unsigned Size = VT.getScalarSizeInBits(); + if ((Known.isConstant() && Known.getConstant().ule(Size)) || + (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) { + EVT MidVT = VT.isVector() ? + EVT::getVectorVT(*DAG.getContext(), MVT::i32, + VT.getVectorNumElements()) : MVT::i32; + + EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout()); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT, + Src.getOperand(0)); + DCI.AddToWorklist(Trunc.getNode()); + + if (Amt.getValueType() != NewShiftVT) { + Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT); + DCI.AddToWorklist(Amt.getNode()); + } + + SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT, + Trunc, Amt); + return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift); + } + } + } + + return SDValue(); +} + // We need to specifically handle i64 mul here to avoid unnecessary conversion // instructions. If we only match on the legalized i64 mul expansion, // SimplifyDemandedBits will be unable to remove them because there will be @@ -3058,6 +3154,17 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); + + // SimplifyDemandedBits has the annoying habit of turning useful zero_extends + // in the source into any_extends if the result of the mul is truncated. Since + // we can assume the high bits are whatever we want, use the underlying value + // to avoid the unknown high bits from interfering. + if (N0.getOpcode() == ISD::ANY_EXTEND) + N0 = N0.getOperand(0); + + if (N1.getOpcode() == ISD::ANY_EXTEND) + N1 = N1.getOperand(0); + SDValue Mul; if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { @@ -3495,6 +3602,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, case ISD::FSIN: case AMDGPUISD::RCP: case AMDGPUISD::RCP_LEGACY: + case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::SIN_HW: { SDValue CvtSrc = N0.getOperand(0); if (CvtSrc.getOpcode() == ISD::FNEG) { @@ -3571,6 +3679,18 @@ SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N, } } +SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); + if (!CFP) + return SDValue(); + + // XXX - Should this flush denormals? 
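// Illustration, not a line from the patch: for a source constant such as 2.0
// the fold below returns the constant 0.5 (1.0 / 2.0 evaluated at compile
// time in the type's float semantics), leaving the denormal question above open.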
+ const APFloat &Val = CFP->getValueAPF(); + APFloat One(Val.getSemantics(), "1.0"); + return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); +} + SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -3617,12 +3737,13 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, // TODO: Generalize and move to DAGCombiner SDValue Src = N->getOperand(0); if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) { - assert(Src.getValueType() == MVT::i64); - SDLoc SL(N); - uint64_t CVal = C->getZExtValue(); - return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT, - DAG.getConstant(Lo_32(CVal), SL, MVT::i32), - DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); + if (Src.getValueType() == MVT::i64) { + SDLoc SL(N); + uint64_t CVal = C->getZExtValue(); + return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT, + DAG.getConstant(Lo_32(CVal), SL, MVT::i32), + DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); + } } if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) { @@ -3656,6 +3777,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performSraCombine(N, DCI); } + case ISD::TRUNCATE: + return performTruncateCombine(N, DCI); case ISD::MUL: return performMulCombine(N, DCI); case ISD::MULHS: @@ -3768,18 +3891,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, return performLoadCombine(N, DCI); case ISD::STORE: return performStoreCombine(N, DCI); - case AMDGPUISD::CLAMP: - return performClampCombine(N, DCI); - case AMDGPUISD::RCP: { - if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) { - // XXX - Should this flush denormals? - const APFloat &Val = CFP->getValueAPF(); - APFloat One(Val.getSemantics(), "1.0"); - return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); - } - - break; - } + case AMDGPUISD::RCP: + case AMDGPUISD::RCP_IFLAG: + return performRcpCombine(N, DCI); case ISD::AssertZext: case ISD::AssertSext: return performAssertSZExtCombine(N, DCI); @@ -3856,9 +3970,14 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, } uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( - const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const { - unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr(); - uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment); + const MachineFunction &MF, const ImplicitParameter Param) const { + const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); + const AMDGPUSubtarget &ST = + AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction()); + unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction()); + unsigned Alignment = ST.getAlignmentForImplicitArgPtr(); + uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) + + ExplicitArgOffset; switch (Param) { case GRID_DIM: return ArgOffset; @@ -3907,6 +4026,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FMED3) NODE_NAME_CASE(SMED3) NODE_NAME_CASE(UMED3) + NODE_NAME_CASE(FDOT2) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DIV_SCALE) NODE_NAME_CASE(DIV_FMAS) @@ -3917,6 +4037,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(RSQ) NODE_NAME_CASE(RCP_LEGACY) NODE_NAME_CASE(RSQ_LEGACY) + NODE_NAME_CASE(RCP_IFLAG) NODE_NAME_CASE(FMUL_LEGACY) NODE_NAME_CASE(RSQ_CLAMP) NODE_NAME_CASE(LDEXP) @@ -3941,6 +4062,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(MAD_I24) 
NODE_NAME_CASE(MAD_I64_I32) NODE_NAME_CASE(MAD_U64_U32) + NODE_NAME_CASE(PERM) NODE_NAME_CASE(TEXTURE_FETCH) NODE_NAME_CASE(EXPORT) NODE_NAME_CASE(EXPORT_DONE) @@ -3957,6 +4079,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVT_F32_UBYTE2) NODE_NAME_CASE(CVT_F32_UBYTE3) NODE_NAME_CASE(CVT_PKRTZ_F16_F32) + NODE_NAME_CASE(CVT_PKNORM_I16_F32) + NODE_NAME_CASE(CVT_PKNORM_U16_F32) + NODE_NAME_CASE(CVT_PK_I16_I32) + NODE_NAME_CASE(CVT_PK_U16_U32) NODE_NAME_CASE(FP_TO_FP16) NODE_NAME_CASE(FP16_ZEXT) NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) @@ -3976,14 +4102,21 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3) + NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16) NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) + NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(ATOMIC_CMP_SWAP) NODE_NAME_CASE(ATOMIC_INC) NODE_NAME_CASE(ATOMIC_DEC) + NODE_NAME_CASE(ATOMIC_LOAD_FADD) + NODE_NAME_CASE(ATOMIC_LOAD_FMIN) + NODE_NAME_CASE(ATOMIC_LOAD_FMAX) NODE_NAME_CASE(BUFFER_LOAD) NODE_NAME_CASE(BUFFER_LOAD_FORMAT) + NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) NODE_NAME_CASE(BUFFER_STORE) NODE_NAME_CASE(BUFFER_STORE_FORMAT) + NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) NODE_NAME_CASE(BUFFER_ATOMIC_ADD) NODE_NAME_CASE(BUFFER_ATOMIC_SUB) @@ -3995,6 +4128,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_OR) NODE_NAME_CASE(BUFFER_ATOMIC_XOR) NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) + case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } return nullptr; @@ -4108,14 +4242,45 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( Known.Zero.setHighBits(32 - MaxValBits); break; } + case AMDGPUISD::PERM: { + ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + if (!CMask) + return; + + KnownBits LHSKnown, RHSKnown; + DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1); + DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1); + unsigned Sel = CMask->getZExtValue(); + + for (unsigned I = 0; I < 32; I += 8) { + unsigned SelBits = Sel & 0xff; + if (SelBits < 4) { + SelBits *= 8; + Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; + Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; + } else if (SelBits < 7) { + SelBits = (SelBits & 3) * 8; + Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I; + Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I; + } else if (SelBits == 0x0c) { + Known.Zero |= 0xff << I; + } else if (SelBits > 0x0c) { + Known.One |= 0xff << I; + } + Sel >>= 8; + } + break; + } case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); switch (IID) { case Intrinsic::amdgcn_mbcnt_lo: case Intrinsic::amdgcn_mbcnt_hi: { + const GCNSubtarget &ST = + DAG.getMachineFunction().getSubtarget<GCNSubtarget>(); // These return at most the wavefront size - 1. 
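// Illustration, not a line from the patch: with a 64-lane wavefront the count
// is at most 63 and fits in 6 bits, so for a 32-bit result the high
// 32 - 6 = 26 bits are known zero.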
unsigned Size = Op.getValueType().getSizeInBits(); - Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2()); + Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2()); break; } default: diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 5c31bddd9b1a..a4c3b413e103 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Interface definition of the TargetLowering class that is common +/// Interface definition of the TargetLowering class that is common /// to all AMD GPUs. // //===----------------------------------------------------------------------===// @@ -28,6 +28,8 @@ struct ArgDescriptor; class AMDGPUTargetLowering : public TargetLowering { private: + const AMDGPUSubtarget *Subtarget; + /// \returns AMDGPUISD::FFBH_U32 node if the incoming \p Op may have been /// legalized from a smaller type VT. Need to match pre-legalized type because /// the generic legalization inserts the add/sub between the select and @@ -39,12 +41,11 @@ public: static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG); protected: - const AMDGPUSubtarget *Subtarget; AMDGPUAS AMDGPUASI; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - /// \brief Split a vector store into multiple scalar stores. + /// Split a vector store into multiple scalar stores. /// \returns The resulting chain. SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const; @@ -78,7 +79,6 @@ protected: bool shouldCombineMemoryType(EVT VT) const; SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, @@ -87,6 +87,7 @@ protected: SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -96,6 +97,7 @@ protected: SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const; static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); @@ -108,10 +110,10 @@ protected: SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const; SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const; - /// \brief Split a vector load into 2 loads of half the vector. + /// Split a vector load into 2 loads of half the vector. SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const; - /// \brief Split a vector store into 2 stores of half the vector. + /// Split a vector store into 2 stores of half the vector. 
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; @@ -120,8 +122,11 @@ protected: SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results) const; - void analyzeFormalArgumentsCompute(CCState &State, - const SmallVectorImpl<ISD::InputArg> &Ins) const; + + void analyzeFormalArgumentsCompute( + CCState &State, + const SmallVectorImpl<ISD::InputArg> &Ins) const; + public: AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI); @@ -136,6 +141,10 @@ public: return false; } + static inline SDValue stripBitcast(SDValue Val) { + return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val; + } + static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4); bool isFAbsFree(EVT VT) const override; @@ -146,7 +155,6 @@ public: bool isZExtFree(Type *Src, Type *Dest) const override; bool isZExtFree(EVT Src, EVT Dest) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; - bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const override; bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; @@ -168,6 +176,7 @@ public: bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; + bool isSDNodeAlwaysUniform(const SDNode *N) const override; static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg); static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg); @@ -224,7 +233,7 @@ public: virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const = 0; - /// \brief Determine which of the bits specified in \p Mask are known to be + /// Determine which of the bits specified in \p Mask are known to be /// either zero or one and return them in the \p KnownZero and \p KnownOne /// bitsets. void computeKnownBitsForTargetNode(const SDValue Op, @@ -237,7 +246,7 @@ public: const SelectionDAG &DAG, unsigned Depth = 0) const override; - /// \brief Helper function that adds Reg to the LiveIn list of the DAG's + /// Helper function that adds Reg to the LiveIn list of the DAG's /// MachineFunction. /// /// \returns a RegisterSDNode representing Reg if \p RawReg is true, otherwise @@ -285,9 +294,9 @@ public: GRID_OFFSET, }; - /// \brief Helper function that returns the byte offset of the given + /// Helper function that returns the byte offset of the given /// type of implicit parameter. - uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, + uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const; AMDGPUAS getAMDGPUAS() const { @@ -357,6 +366,7 @@ enum NodeType : unsigned { FMED3, SMED3, UMED3, + FDOT2, URECIP, DIV_SCALE, DIV_FMAS, @@ -372,6 +382,7 @@ enum NodeType : unsigned { RSQ, RCP_LEGACY, RSQ_LEGACY, + RCP_IFLAG, FMUL_LEGACY, RSQ_CLAMP, LDEXP, @@ -396,6 +407,7 @@ enum NodeType : unsigned { MAD_I64_I32, MUL_LOHI_I24, MUL_LOHI_U24, + PERM, TEXTURE_FETCH, EXPORT, // exp on SI+ EXPORT_DONE, // exp on SI+ with done bit set @@ -417,6 +429,10 @@ enum NodeType : unsigned { // Convert two float 32 numbers into a single register holding two packed f16 // with round to zero. CVT_PKRTZ_F16_F32, + CVT_PKNORM_I16_F32, + CVT_PKNORM_U16_F32, + CVT_PK_I16_I32, + CVT_PK_U16_U32, // Same as the standard node, except the high bits of the resulting integer // are known 0. 
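The CVT_PKNORM_* and CVT_PK_* nodes added above model the VOP instructions that convert a pair of 32-bit inputs into a single packed 32-bit result. As a rough scalar sketch of what CVT_PKNORM_I16_F32 computes (the helper name, the use of std::lround, and the omitted NaN handling are illustrative assumptions, not the backend's implementation):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Clamp each f32 to [-1, 1], scale to the signed 16-bit range, and pack
    // the two results into one 32-bit value (first operand in the low half).
    static uint32_t cvtPkNormI16(float Lo, float Hi) {
      auto Cvt = [](float F) -> uint16_t {
        F = std::min(1.0f, std::max(-1.0f, F));   // clamp to [-1, 1]
        long V = std::lround(F * 32767.0f);       // rounding mode simplified
        return static_cast<uint16_t>(static_cast<int16_t>(V));
      };
      return Cvt(Lo) | (static_cast<uint32_t>(Cvt(Hi)) << 16);
    }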
@@ -451,14 +467,21 @@ enum NodeType : unsigned { LOAD_CONSTANT, TBUFFER_STORE_FORMAT, TBUFFER_STORE_FORMAT_X3, + TBUFFER_STORE_FORMAT_D16, TBUFFER_LOAD_FORMAT, + TBUFFER_LOAD_FORMAT_D16, ATOMIC_CMP_SWAP, ATOMIC_INC, ATOMIC_DEC, + ATOMIC_LOAD_FADD, + ATOMIC_LOAD_FMIN, + ATOMIC_LOAD_FMAX, BUFFER_LOAD, BUFFER_LOAD_FORMAT, + BUFFER_LOAD_FORMAT_D16, BUFFER_STORE, BUFFER_STORE_FORMAT, + BUFFER_STORE_FORMAT_D16, BUFFER_ATOMIC_SWAP, BUFFER_ATOMIC_ADD, BUFFER_ATOMIC_SUB, @@ -470,6 +493,7 @@ enum NodeType : unsigned { BUFFER_ATOMIC_OR, BUFFER_ATOMIC_XOR, BUFFER_ATOMIC_CMPSWAP, + LAST_AMDGPU_ISD_NUMBER }; diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp index ff9e7b50ed5c..35dd9eb0a478 100644 --- a/lib/Target/AMDGPU/AMDGPUInline.cpp +++ b/lib/Target/AMDGPU/AMDGPUInline.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This is AMDGPU specific replacement of the standard inliner. +/// This is AMDGPU specific replacement of the standard inliner. /// The main purpose is to account for the fact that calls not only expensive /// on the AMDGPU, but much more expensive if a private memory pointer is /// passed to a function as an argument. In this situation, we are unable to @@ -161,8 +161,8 @@ static bool isWrapperOnlyCall(CallSite CS) { return false; } if (isa<ReturnInst>(*std::next(I->getIterator()))) { - DEBUG(dbgs() << " Wrapper only call detected: " - << Callee->getName() << '\n'); + LLVM_DEBUG(dbgs() << " Wrapper only call detected: " + << Callee->getName() << '\n'); return true; } } diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 8156599528c2..07aa7c2cc8ad 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -16,95 +16,36 @@ #include "AMDGPUInstrInfo.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" using namespace llvm; -#define GET_INSTRINFO_CTOR_DTOR -#include "AMDGPUGenInstrInfo.inc" - // Pin the vtable to this file. -void AMDGPUInstrInfo::anchor() {} - -AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST) - : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), - ST(ST), - AMDGPUASI(ST.getAMDGPUAS()) {} - -// FIXME: This behaves strangely. If, for example, you have 32 load + stores, -// the first 16 loads will be interleaved with the stores, and the next 16 will -// be clustered as expected. It should really split into 2 16 store batches. -// -// Loads are clustered until this returns false, rather than trying to schedule -// groups of stores. This also means we have to deal with saying different -// address space loads should be clustered, and ones which might cause bank -// conflicts. -// -// This might be deprecated so it might not be worth that much effort to fix. -bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, - int64_t Offset0, int64_t Offset1, - unsigned NumLoads) const { - assert(Offset1 > Offset0 && - "Second offset should be larger than first offset!"); - // If we have less than 16 loads in a row, and the offsets are within 64 - // bytes, then schedule together. - - // A cacheline is 64 bytes (for global memory). 
- return (NumLoads <= 16 && (Offset1 - Offset0) < 64); -} - -// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td -enum SIEncodingFamily { - SI = 0, - VI = 1, - SDWA = 2, - SDWA9 = 3, - GFX9 = 4 -}; - -static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { - switch (ST.getGeneration()) { - case AMDGPUSubtarget::SOUTHERN_ISLANDS: - case AMDGPUSubtarget::SEA_ISLANDS: - return SIEncodingFamily::SI; - case AMDGPUSubtarget::VOLCANIC_ISLANDS: - case AMDGPUSubtarget::GFX9: - return SIEncodingFamily::VI; - - // FIXME: This should never be called for r600 GPUs. - case AMDGPUSubtarget::R600: - case AMDGPUSubtarget::R700: - case AMDGPUSubtarget::EVERGREEN: - case AMDGPUSubtarget::NORTHERN_ISLANDS: - return SIEncodingFamily::SI; - } - - llvm_unreachable("Unknown subtarget generation!"); -} - -int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { - SIEncodingFamily Gen = subtargetEncodingFamily(ST); +//void AMDGPUInstrInfo::anchor() {} - if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && - ST.getGeneration() >= AMDGPUSubtarget::GFX9) - Gen = SIEncodingFamily::GFX9; +AMDGPUInstrInfo::AMDGPUInstrInfo(const GCNSubtarget &ST) { } - if (get(Opcode).TSFlags & SIInstrFlags::SDWA) - Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9 - : SIEncodingFamily::SDWA; - int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); +// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence. +bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) { + const Value *Ptr = MMO->getValue(); + // UndefValue means this is a load of a kernel input. These are uniform. + // Sometimes LDS instructions have constant pointers. + // If Ptr is null, then that means this mem operand contains a + // PseudoSourceValue like GOT. + if (!Ptr || isa<UndefValue>(Ptr) || + isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) + return true; - // -1 means that Opcode is already a native instruction. - if (MCOp == -1) - return Opcode; + if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) + return true; - // (uint16_t)-1 means that Opcode is a pseudo instruction that has - // no encoding in the given subtarget generation. - if (MCOp == (uint16_t)-1) - return -1; + if (const Argument *Arg = dyn_cast<Argument>(Ptr)) + return AMDGPU::isArgPassedInSGPR(Arg); - return MCOp; + const Instruction *I = dyn_cast<Instruction>(Ptr); + return I && I->getMetadata("amdgpu.uniform"); } diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h index a9fcd4834638..2f8166da0d33 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Contains the definition of a TargetInstrInfo class that is common +/// Contains the definition of a TargetInstrInfo class that is common /// to all AMD GPUs. 
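/// The class itself is now only a small collection of shared static helpers;
/// the TableGen-generated TargetInstrInfo implementations live in SIInstrInfo
/// and R600InstrInfo.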
// //===----------------------------------------------------------------------===// @@ -20,37 +20,43 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" -#define GET_INSTRINFO_HEADER -#include "AMDGPUGenInstrInfo.inc" -#undef GET_INSTRINFO_HEADER - namespace llvm { -class AMDGPUSubtarget; +class GCNSubtarget; class MachineFunction; class MachineInstr; class MachineInstrBuilder; -class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { -private: - const AMDGPUSubtarget &ST; +class AMDGPUInstrInfo { +public: + explicit AMDGPUInstrInfo(const GCNSubtarget &st); - virtual void anchor(); -protected: - AMDGPUAS AMDGPUASI; + static bool isUniformMMO(const MachineMemOperand *MMO); +}; -public: - explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); +namespace AMDGPU { - bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, - int64_t Offset1, int64_t Offset2, - unsigned NumLoads) const override; +struct RsrcIntrinsic { + unsigned Intr; + uint8_t RsrcArg; + bool IsImage; +}; +const RsrcIntrinsic *lookupRsrcIntrinsic(unsigned Intr); + +struct D16ImageDimIntrinsic { + unsigned Intr; + unsigned D16HelperIntr; +}; +const D16ImageDimIntrinsic *lookupD16ImageDimIntrinsic(unsigned Intr); - /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. - /// Return -1 if the target-specific opcode for the pseudo instruction does - /// not exist. If Opcode is not a pseudo instruction, this is identity. - int pseudoToMCOpcode(int Opcode) const; +struct ImageDimIntrinsicInfo { + unsigned Intr; + unsigned BaseOpcode; + MIMGDim Dim; }; +const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr); + +} // end AMDGPU namespace } // End llvm namespace #endif diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index c024010f3e96..96b7568eec1f 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -35,6 +35,10 @@ def AMDGPUFPPackOp : SDTypeProfile<1, 2, [SDTCisFP<1>, SDTCisSameAs<1, 2>] >; +def AMDGPUIntPackOp : SDTypeProfile<1, 2, + [SDTCisInt<1>, SDTCisSameAs<1, 2>] +>; + def AMDGPUDivScaleOp : SDTypeProfile<2, 3, [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] >; @@ -136,12 +140,18 @@ def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; def AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>; def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; +def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>; + // out = 1.0 / sqrt(a) result clamped to +/- max_float. def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>; +def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>; +def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>; +def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>; +def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>; def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>; def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>; @@ -160,8 +170,6 @@ def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp, [SDNPCommutative, SDNPAssociative] >; -def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; - // out = min(a, b) a and b are floats, where a nan comparison fails. 
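// (Note the empty property list below: a failed compare makes the result
// depend on operand order, so the node is not marked SDNPCommutative.)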
def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, [] @@ -333,6 +341,13 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; +def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2", + SDTypeProfile<1, 3, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>, + SDTCisFP<0>, SDTCisVec<1>]>, + []>; + +def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>; + def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC", SDTypeProfile<0, 1, [SDTCisInt<0>]>, [SDNPHasChain, SDNPInGlue]>; diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 16d240e96196..219d430fbb39 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -17,6 +17,12 @@ #include "AMDGPURegisterBankInfo.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" +#include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" @@ -30,10 +36,48 @@ using namespace llvm; +#define GET_GLOBALISEL_IMPL +#define AMDGPUSubtarget GCNSubtarget +#include "AMDGPUGenGlobalISel.inc" +#undef GET_GLOBALISEL_IMPL +#undef AMDGPUSubtarget + AMDGPUInstructionSelector::AMDGPUInstructionSelector( - const SISubtarget &STI, const AMDGPURegisterBankInfo &RBI) + const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI, + const AMDGPUTargetMachine &TM) : InstructionSelector(), TII(*STI.getInstrInfo()), - TRI(*STI.getRegisterInfo()), RBI(RBI), AMDGPUASI(STI.getAMDGPUAS()) {} + TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM), + STI(STI), + EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG), +#define GET_GLOBALISEL_PREDICATES_INIT +#include "AMDGPUGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_INIT +#define GET_GLOBALISEL_TEMPORARIES_INIT +#include "AMDGPUGenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_INIT + ,AMDGPUASI(STI.getAMDGPUAS()) +{ +} + +const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; } + +bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + I.setDesc(TII.get(TargetOpcode::COPY)); + for (const MachineOperand &MO : I.operands()) { + if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + continue; + + const TargetRegisterClass *RC = + TRI.getConstrainedRegClassForOperand(MO, MRI); + if (!RC) + continue; + RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); + } + return true; +} MachineOperand AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, @@ -71,6 +115,10 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO, } } +static int64_t getConstant(const MachineInstr *MI) { + return MI->getOperand(1).getCImm()->getSExtValue(); +} + bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); @@ -118,12 +166,144 @@ bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const { return selectG_ADD(I); } +bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const { + MachineBasicBlock *BB 
= I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const MachineOperand &MO = I.getOperand(0); + const TargetRegisterClass *RC = + TRI.getConstrainedRegClassForOperand(MO, MRI); + if (RC) + RBI.constrainGenericRegister(MO.getReg(), *RC, MRI); + I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); + return true; +} + +bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I, + CodeGenCoverage &CoverageInfo) const { + unsigned IntrinsicID = I.getOperand(1).getIntrinsicID(); + + switch (IntrinsicID) { + default: + break; + case Intrinsic::maxnum: + case Intrinsic::minnum: + case Intrinsic::amdgcn_cvt_pkrtz: + return selectImpl(I, CoverageInfo); + + case Intrinsic::amdgcn_kernarg_segment_ptr: { + MachineFunction *MF = I.getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + const ArgDescriptor *InputPtrReg; + const TargetRegisterClass *RC; + const DebugLoc &DL = I.getDebugLoc(); + + std::tie(InputPtrReg, RC) + = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); + if (!InputPtrReg) + report_fatal_error("missing kernarg segment ptr"); + + BuildMI(*I.getParent(), &I, DL, TII.get(AMDGPU::COPY)) + .add(I.getOperand(0)) + .addReg(MRI.getLiveInVirtReg(InputPtrReg->getRegister())); + I.eraseFromParent(); + return true; + } + } + return false; +} + +static MachineInstr * +buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt, + unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3, + unsigned VM, bool Compr, unsigned Enabled, bool Done) { + const DebugLoc &DL = Insert->getDebugLoc(); + MachineBasicBlock &BB = *Insert->getParent(); + unsigned Opcode = Done ? AMDGPU::EXP_DONE : AMDGPU::EXP; + return BuildMI(BB, Insert, DL, TII.get(Opcode)) + .addImm(Tgt) + .addReg(Reg0) + .addReg(Reg1) + .addReg(Reg2) + .addReg(Reg3) + .addImm(VM) + .addImm(Compr) + .addImm(Enabled); +} + +bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( + MachineInstr &I, + CodeGenCoverage &CoverageInfo) const { + MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + unsigned IntrinsicID = I.getOperand(0).getIntrinsicID(); + switch (IntrinsicID) { + case Intrinsic::amdgcn_exp: { + int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg())); + int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg())); + int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(7).getReg())); + int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(8).getReg())); + + MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(), + I.getOperand(4).getReg(), + I.getOperand(5).getReg(), + I.getOperand(6).getReg(), + VM, false, Enabled, Done); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI); + } + case Intrinsic::amdgcn_exp_compr: { + const DebugLoc &DL = I.getDebugLoc(); + int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg())); + int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg())); + unsigned Reg0 = I.getOperand(3).getReg(); + unsigned Reg1 = I.getOperand(4).getReg(); + unsigned Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(5).getReg())); + int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(6).getReg())); + + BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef); + MachineInstr *Exp = buildEXP(TII, &I, 
Tgt, Reg0, Reg1, Undef, Undef, VM, + true, Enabled, Done); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI); + } + } + return false; +} + bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); + MachineFunction *MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); DebugLoc DL = I.getDebugLoc(); + unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI); + unsigned Opcode; // FIXME: Select store instruction based on address space - MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(AMDGPU::FLAT_STORE_DWORD)) + switch (StoreSize) { + default: + return false; + case 32: + Opcode = AMDGPU::FLAT_STORE_DWORD; + break; + case 64: + Opcode = AMDGPU::FLAT_STORE_DWORDX2; + break; + case 96: + Opcode = AMDGPU::FLAT_STORE_DWORDX3; + break; + case 128: + Opcode = AMDGPU::FLAT_STORE_DWORDX4; + break; + } + + MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode)) .add(I.getOperand(1)) .add(I.getOperand(0)) .addImm(0) // offset @@ -143,36 +323,67 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineOperand &ImmOp = I.getOperand(1); + + // The AMDGPU backend only supports Imm operands and not CImm or FPImm. + if (ImmOp.isFPImm()) { + const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt(); + ImmOp.ChangeToImmediate(Imm.getZExtValue()); + } else if (ImmOp.isCImm()) { + ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue()); + } + unsigned DstReg = I.getOperand(0).getReg(); - unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI); + unsigned Size; + bool IsSgpr; + const RegisterBank *RB = MRI.getRegBankOrNull(I.getOperand(0).getReg()); + if (RB) { + IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; + Size = MRI.getType(DstReg).getSizeInBits(); + } else { + const TargetRegisterClass *RC = TRI.getRegClassForReg(MRI, DstReg); + IsSgpr = TRI.isSGPRClass(RC); + Size = TRI.getRegSizeInBits(*RC); + } + if (Size != 32 && Size != 64) + return false; + + unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; if (Size == 32) { - I.setDesc(TII.get(AMDGPU::S_MOV_B32)); + I.setDesc(TII.get(Opcode)); + I.addImplicitDefUseOperands(*MF); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - assert(Size == 64); - DebugLoc DL = I.getDebugLoc(); - unsigned LoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - unsigned HiReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - const APInt &Imm = I.getOperand(1).getCImm()->getValue(); + const TargetRegisterClass *RC = IsSgpr ? 
&AMDGPU::SReg_32_XM0RegClass : + &AMDGPU::VGPR_32RegClass; + unsigned LoReg = MRI.createVirtualRegister(RC); + unsigned HiReg = MRI.createVirtualRegister(RC); + const APInt &Imm = APInt(Size, I.getOperand(1).getImm()); - BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), LoReg) + BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg) .addImm(Imm.trunc(32).getZExtValue()); - BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg) + BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg) .addImm(Imm.ashr(32).getZExtValue()); - BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) - .addReg(LoReg) - .addImm(AMDGPU::sub0) - .addReg(HiReg) - .addImm(AMDGPU::sub1); + const MachineInstr *RS = + BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) + .addReg(LoReg) + .addImm(AMDGPU::sub0) + .addReg(HiReg) + .addImm(AMDGPU::sub1); + // We can't call constrainSelectedInstRegOperands here, because it doesn't // work for target independent opcodes I.eraseFromParent(); - return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI); + const TargetRegisterClass *DstRC = + TRI.getConstrainedRegClassForOperand(RS->getOperand(0), MRI); + if (!DstRC) + return true; + return RBI.constrainGenericRegister(DstReg, *DstRC, MRI); } static bool isConstant(const MachineInstr &MI) { @@ -228,6 +439,9 @@ static bool isInstrUniform(const MachineInstr &MI) { isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) return true; + if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) + return true; + const Instruction *I = dyn_cast<Instruction>(Ptr); return I && I->getMetadata("amdgpu.uniform"); } @@ -292,7 +506,8 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I, if (!I.hasOneMemOperand()) return false; - if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS) + if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS && + (*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT) return false; if (!isInstrUniform(I)) @@ -303,7 +518,7 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I, MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); - const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned DstReg = I.getOperand(0).getReg(); const DebugLoc &DL = I.getDebugLoc(); @@ -405,18 +620,30 @@ bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { bool AMDGPUInstructionSelector::select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const { - if (!isPreISelGenericOpcode(I.getOpcode())) + if (!isPreISelGenericOpcode(I.getOpcode())) { + if (I.isCopy()) + return selectCOPY(I); return true; + } switch (I.getOpcode()) { default: - break; + return selectImpl(I, CoverageInfo); case TargetOpcode::G_ADD: return selectG_ADD(I); + case TargetOpcode::G_BITCAST: + return selectCOPY(I); case TargetOpcode::G_CONSTANT: + case TargetOpcode::G_FCONSTANT: return selectG_CONSTANT(I); case TargetOpcode::G_GEP: return selectG_GEP(I); + case TargetOpcode::G_IMPLICIT_DEF: + return selectG_IMPLICIT_DEF(I); + case TargetOpcode::G_INTRINSIC: + return selectG_INTRINSIC(I, CoverageInfo); + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo); case TargetOpcode::G_LOAD: return selectG_LOAD(I); case TargetOpcode::G_STORE: @@ -424,3 +651,47 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, } return false; } + +InstructionSelector::ComplexRendererFns 
+AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); } + }}; + +} + +/// +/// This will select either an SGPR or VGPR operand and will save us from +/// having to write an extra tablegen pattern. +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); } + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src0_mods + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod + }}; +} +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { + return {{ + [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods + }}; +} diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 715c4882f380..68b40b20aca2 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -15,27 +15,39 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H #include "AMDGPU.h" +#include "AMDGPUArgumentUsageInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +namespace { +#define GET_GLOBALISEL_PREDICATE_BITSET +#define AMDGPUSubtarget GCNSubtarget +#include "AMDGPUGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATE_BITSET +#undef AMDGPUSubtarget +} + namespace llvm { class AMDGPUInstrInfo; class AMDGPURegisterBankInfo; +class GCNSubtarget; class MachineInstr; class MachineOperand; class MachineRegisterInfo; class SIInstrInfo; +class SIMachineFunctionInfo; class SIRegisterInfo; -class SISubtarget; class AMDGPUInstructionSelector : public InstructionSelector { public: - AMDGPUInstructionSelector(const SISubtarget &STI, - const AMDGPURegisterBankInfo &RBI); + AMDGPUInstructionSelector(const GCNSubtarget &STI, + const AMDGPURegisterBankInfo &RBI, + const AMDGPUTargetMachine &TM); bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + static const char *getName(); private: struct GEPInfo { @@ -46,10 +58,18 @@ private: GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { } }; + /// tblgen-erated 'select' implementation. 
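  /// Generated from the GlobalISel patterns in AMDGPUGenGlobalISel.inc;
  /// returns true if a pattern matched and selected \p I.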
+ bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + MachineOperand getSubOperand64(MachineOperand &MO, unsigned SubIdx) const; + bool selectCOPY(MachineInstr &I) const; bool selectG_CONSTANT(MachineInstr &I) const; bool selectG_ADD(MachineInstr &I) const; bool selectG_GEP(MachineInstr &I) const; + bool selectG_IMPLICIT_DEF(MachineInstr &I) const; + bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; + bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I, + CodeGenCoverage &CoverageInfo) const; bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const; void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const; @@ -57,9 +77,35 @@ private: bool selectG_LOAD(MachineInstr &I) const; bool selectG_STORE(MachineInstr &I) const; + InstructionSelector::ComplexRendererFns + selectVCSRC(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectVSRC0(MachineOperand &Root) const; + + InstructionSelector::ComplexRendererFns + selectVOP3Mods0(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3OMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3Mods(MachineOperand &Root) const; + const SIInstrInfo &TII; const SIRegisterInfo &TRI; const AMDGPURegisterBankInfo &RBI; + const AMDGPUTargetMachine &TM; + const GCNSubtarget &STI; + bool EnableLateStructurizeCFG; +#define GET_GLOBALISEL_PREDICATES_DECL +#define AMDGPUSubtarget GCNSubtarget +#include "AMDGPUGenGlobalISel.inc" +#undef GET_GLOBALISEL_PREDICATES_DECL +#undef AMDGPUSubtarget + +#define GET_GLOBALISEL_TEMPORARIES_DECL +#include "AMDGPUGenGlobalISel.inc" +#undef GET_GLOBALISEL_TEMPORARIES_DECL + protected: AMDGPUAS AMDGPUASI; }; diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 31f728b0c22f..9426df399597 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -42,6 +42,47 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm = "", field bits<32> Inst = 0xffffffff; } +//===---------------------------------------------------------------------===// +// Return instruction +//===---------------------------------------------------------------------===// + +class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> +: Instruction { + + let Namespace = "AMDGPU"; + dag OutOperandList = outs; + dag InOperandList = ins; + let Pattern = pattern; + let AsmString = !strconcat(asmstr, "\n"); + let isPseudo = 1; + let Itinerary = NullALU; + bit hasIEEEFlag = 0; + bit hasZeroOpFlag = 0; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 1; +} + +def TruePredicate : Predicate<"true">; + +// Exists to help track down where SubtargetPredicate isn't set rather +// than letting tablegen crash with an unhelpful error. 
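// Anything that still carries this default is then easy to spot: the string
// below appears verbatim wherever the offending predicate is emitted or
// reported.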
+def InvalidPred : Predicate<"predicate not set on instruction or pattern">; + +class PredicateControl { + Predicate SubtargetPredicate = InvalidPred; + list<Predicate> AssemblerPredicates = []; + Predicate AssemblerPredicate = TruePredicate; + list<Predicate> OtherPredicates = []; + list<Predicate> Predicates = !listconcat([SubtargetPredicate, + AssemblerPredicate], + AssemblerPredicates, + OtherPredicates); +} +class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>, + PredicateControl; + def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">; def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">; def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">; @@ -52,7 +93,6 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; def FMA : Predicate<"Subtarget->hasFMA()">; def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; -def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; def u16ImmTarget : AsmOperandClass { let Name = "U16Imm"; @@ -95,12 +135,6 @@ def brtarget : Operand<OtherVT>; // Misc. PatFrags //===----------------------------------------------------------------------===// -class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag< - (ops node:$src0), - (op $src0), - [{ return N->hasOneUse(); }] ->; - class HasOneUseBinOp<SDPatternOperator op> : PatFrag< (ops node:$src0, node:$src1), (op $src0, $src1), @@ -113,8 +147,6 @@ class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag< [{ return N->hasOneUse(); }] >; -def trunc_oneuse : HasOneUseUnaryOp<trunc>; - let Properties = [SDNPCommutative, SDNPAssociative] in { def smax_oneuse : HasOneUseBinOp<smax>; def smin_oneuse : HasOneUseBinOp<smin>; @@ -127,6 +159,7 @@ def or_oneuse : HasOneUseBinOp<or>; def xor_oneuse : HasOneUseBinOp<xor>; } // Properties = [SDNPCommutative, SDNPAssociative] +def add_oneuse : HasOneUseBinOp<add>; def sub_oneuse : HasOneUseBinOp<sub>; def srl_oneuse : HasOneUseBinOp<srl>; @@ -240,6 +273,37 @@ def COND_NULL : PatLeaf < [{(void)N; return false;}] >; +//===----------------------------------------------------------------------===// +// PatLeafs for Texture Constants +//===----------------------------------------------------------------------===// + +def TEX_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 9 || TType == 10 || TType == 16; + }] +>; + +def TEX_RECT : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 5; + }] +>; + +def TEX_SHADOW : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return (TType >= 6 && TType <= 8) || TType == 13; + }] +>; + +def TEX_SHADOW_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 11 || TType == 12 || TType == 17; + }] +>; //===----------------------------------------------------------------------===// // Load/Store Pattern Fragments @@ -249,6 +313,10 @@ class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{ return cast<MemSDNode>(N)->getAlignment() % 8 == 0; }]>; +class Aligned16Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{ + return cast<MemSDNode>(N)->getAlignment() >= 16; +}]>; + class LoadFrag <SDPatternOperator op> : PatFrag<(ops node:$ptr), (op node:$ptr)>; class StoreFrag<SDPatternOperator op> : PatFrag < @@ -361,21 +429,31 @@ def az_extloadi8_local : LocalLoad <az_extloadi8>; def sextloadi8_local : LocalLoad <sextloadi8>; def az_extloadi16_local : LocalLoad <az_extloadi16>; def sextloadi16_local : LocalLoad <sextloadi16>; +def atomic_load_32_local : 
LocalLoad<atomic_load_32>; +def atomic_load_64_local : LocalLoad<atomic_load_64>; def store_local : LocalStore <store>; def truncstorei8_local : LocalStore <truncstorei8>; def truncstorei16_local : LocalStore <truncstorei16>; def store_local_hi16 : StoreHi16 <truncstorei16>, LocalAddress; def truncstorei8_local_hi16 : StoreHi16<truncstorei8>, LocalAddress; +def atomic_store_local : LocalStore <atomic_store>; def load_align8_local : Aligned8Bytes < (ops node:$ptr), (load_local node:$ptr) >; +def load_align16_local : Aligned16Bytes < + (ops node:$ptr), (load_local node:$ptr) +>; + def store_align8_local : Aligned8Bytes < (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr) >; +def store_align16_local : Aligned16Bytes < + (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr) +>; def load_flat : FlatLoad <load>; def az_extloadi8_flat : FlatLoad <az_extloadi8>; @@ -571,6 +649,18 @@ multiclass BFIPatterns <Instruction BFI_INT, (BFI_INT $x, $y, $z) >; + // 64-bit version + def : AMDGPUPat < + (or (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))), + (REG_SEQUENCE RC64, + (BFI_INT (i32 (EXTRACT_SUBREG $x, sub0)), + (i32 (EXTRACT_SUBREG $y, sub0)), + (i32 (EXTRACT_SUBREG $z, sub0))), sub0, + (BFI_INT (i32 (EXTRACT_SUBREG $x, sub1)), + (i32 (EXTRACT_SUBREG $y, sub1)), + (i32 (EXTRACT_SUBREG $z, sub1))), sub1) + >; + // SHA-256 Ch function // z ^ (x & (y ^ z)) def : AMDGPUPat < @@ -578,6 +668,18 @@ multiclass BFIPatterns <Instruction BFI_INT, (BFI_INT $x, $y, $z) >; + // 64-bit version + def : AMDGPUPat < + (xor i64:$z, (and i64:$x, (xor i64:$y, i64:$z))), + (REG_SEQUENCE RC64, + (BFI_INT (i32 (EXTRACT_SUBREG $x, sub0)), + (i32 (EXTRACT_SUBREG $y, sub0)), + (i32 (EXTRACT_SUBREG $z, sub0))), sub0, + (BFI_INT (i32 (EXTRACT_SUBREG $x, sub1)), + (i32 (EXTRACT_SUBREG $y, sub1)), + (i32 (EXTRACT_SUBREG $z, sub1))), sub1) + >; + def : AMDGPUPat < (fcopysign f32:$src0, f32:$src1), (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1) @@ -611,10 +713,25 @@ multiclass BFIPatterns <Instruction BFI_INT, // SHA-256 Ma patterns // ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y -class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : AMDGPUPat < - (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), - (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) ->; +multiclass SHA256MaPattern <Instruction BFI_INT, Instruction XOR, RegisterClass RC64> { + def : AMDGPUPat < + (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), + (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) + >; + + def : AMDGPUPat < + (or (and i64:$x, i64:$z), (and i64:$y, (or i64:$x, i64:$z))), + (REG_SEQUENCE RC64, + (BFI_INT (XOR (i32 (EXTRACT_SUBREG $x, sub0)), + (i32 (EXTRACT_SUBREG $y, sub0))), + (i32 (EXTRACT_SUBREG $z, sub0)), + (i32 (EXTRACT_SUBREG $y, sub0))), sub0, + (BFI_INT (XOR (i32 (EXTRACT_SUBREG $x, sub1)), + (i32 (EXTRACT_SUBREG $y, sub1))), + (i32 (EXTRACT_SUBREG $z, sub1)), + (i32 (EXTRACT_SUBREG $y, sub1))), sub1) + >; +} // Bitfield extract patterns @@ -633,14 +750,33 @@ multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> { (UBFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) >; + // x & ((1 << y) - 1) + def : AMDGPUPat < + (and i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)), + (UBFE $src, (MOV (i32 0)), $width) + >; + + // x & ~(-1 << y) + def : AMDGPUPat < + (and i32:$src, (xor_oneuse (shl_oneuse -1, i32:$width), -1)), + (UBFE $src, (MOV (i32 0)), $width) + >; + + // x & (-1 >> (bitwidth - y)) + def : AMDGPUPat < + (and i32:$src, (srl_oneuse -1, (sub 32, 
i32:$width))), + (UBFE $src, (MOV (i32 0)), $width) + >; + + // x << (bitwidth - y) >> (bitwidth - y) def : AMDGPUPat < (srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)), - (UBFE $src, (i32 0), $width) + (UBFE $src, (MOV (i32 0)), $width) >; def : AMDGPUPat < (sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)), - (SBFE $src, (i32 0), $width) + (SBFE $src, (MOV (i32 0)), $width) >; } @@ -697,11 +833,3 @@ class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat < (AMDGPUrcp (fsqrt vt:$src)), (RsqInst $src) >; - -include "R600Instructions.td" -include "R700Instructions.td" -include "EvergreenInstructions.td" -include "CaymanInstructions.td" - -include "SIInstrInfo.td" - diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp index 86dc9bd9ea74..896e2055cf62 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -8,7 +8,7 @@ //==-----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU Implementation of the IntrinsicInfo class. +/// AMDGPU Implementation of the IntrinsicInfo class. // //===-----------------------------------------------------------------------===// @@ -25,13 +25,13 @@ AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() static const char *const IntrinsicNameTable[] = { #define GET_INTRINSIC_NAME_TABLE -#include "AMDGPUGenIntrinsics.inc" +#include "AMDGPUGenIntrinsicImpl.inc" #undef GET_INTRINSIC_NAME_TABLE }; namespace { #define GET_INTRINSIC_ATTRIBUTES -#include "AMDGPUGenIntrinsics.inc" +#include "AMDGPUGenIntrinsicImpl.inc" #undef GET_INTRINSIC_ATTRIBUTES } @@ -80,7 +80,7 @@ unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData, bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const { // Overload Table #define GET_INTRINSIC_OVERLOAD_TABLE -#include "AMDGPUGenIntrinsics.inc" +#include "AMDGPUGenIntrinsicImpl.inc" #undef GET_INTRINSIC_OVERLOAD_TABLE } diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h index 6cb8b9644642..ef42f9a319af 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h @@ -8,7 +8,7 @@ //==-----------------------------------------------------------------------===// // /// \file -/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. +/// Interface for the AMDGPU Implementation of the Intrinsic Info class. 
// //===-----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H @@ -24,7 +24,7 @@ namespace AMDGPUIntrinsic { enum ID { last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, #define GET_INTRINSIC_ENUM_VALUES -#include "AMDGPUGenIntrinsics.inc" +#include "AMDGPUGenIntrinsicEnums.inc" #undef GET_INTRINSIC_ENUM_VALUES , num_AMDGPU_intrinsics }; diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td index 18c9bd933af2..230a04628504 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ b/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -13,7 +13,4 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; - def int_AMDGPU_kilp : Intrinsic<[], [], []>; } - -include "SIIntrinsics.td" diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index b4704f6feb92..87b072c9ea20 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -12,7 +12,9 @@ /// \todo This should be generated by TableGen. //===----------------------------------------------------------------------===// +#include "AMDGPU.h" #include "AMDGPULegalizerInfo.h" +#include "AMDGPUTargetMachine.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" @@ -20,19 +22,46 @@ #include "llvm/Support/Debug.h" using namespace llvm; +using namespace LegalizeActions; -AMDGPULegalizerInfo::AMDGPULegalizerInfo() { +AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, + const GCNTargetMachine &TM) { using namespace TargetOpcode; - const LLT S1= LLT::scalar(1); + auto GetAddrSpacePtr = [&TM](unsigned AS) { + return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); + }; + + auto AMDGPUAS = ST.getAMDGPUAS(); + + const LLT S1 = LLT::scalar(1); const LLT V2S16 = LLT::vector(2, 16); + const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); - const LLT P1 = LLT::pointer(1, 64); - const LLT P2 = LLT::pointer(2, 64); + const LLT S512 = LLT::scalar(512); + + const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); + const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); + const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); + const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS.FLAT_ADDRESS); + const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS.PRIVATE_ADDRESS); + + const LLT AddrSpaces[] = { + GlobalPtr, + ConstantPtr, + LocalPtr, + FlatPtr, + PrivatePtr + }; setAction({G_ADD, S32}, Legal); + setAction({G_ASHR, S32}, Legal); + setAction({G_SUB, S32}, Legal); + setAction({G_MUL, S32}, Legal); setAction({G_AND, S32}, Legal); + setAction({G_OR, S32}, Legal); + setAction({G_XOR, S32}, Legal); setAction({G_BITCAST, V2S16}, Legal); setAction({G_BITCAST, 1, S32}, Legal); @@ -40,41 +69,88 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { setAction({G_BITCAST, S32}, Legal); setAction({G_BITCAST, 1, V2S16}, Legal); + getActionDefinitionsBuilder(G_FCONSTANT) + .legalFor({S32, S64}); + + // G_IMPLICIT_DEF is a no-op so we can make it legal for any value type that + // can fit in a register. + // FIXME: We need to legalize several more operations before we can add + // a test case for size > 512. 
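  // For example, an s1, s32, or s512 G_IMPLICIT_DEF is accepted as-is by the
  // rule below, while a wider scalar falls through to clampScalar and is
  // narrowed to s512.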
+ getActionDefinitionsBuilder(G_IMPLICIT_DEF) + .legalIf([=](const LegalityQuery &Query) { + return Query.Types[0].getSizeInBits() <= 512; + }) + .clampScalar(0, S1, S512); + + getActionDefinitionsBuilder(G_CONSTANT) + .legalFor({S1, S32, S64}); + // FIXME: i1 operands to intrinsics should always be legal, but other i1 // values may not be legal. We need to figure out how to distinguish // between these two scenarios. setAction({G_CONSTANT, S1}, Legal); - setAction({G_CONSTANT, S32}, Legal); - setAction({G_CONSTANT, S64}, Legal); - - setAction({G_FCONSTANT, S32}, Legal); setAction({G_FADD, S32}, Legal); + setAction({G_FCMP, S1}, Legal); + setAction({G_FCMP, 1, S32}, Legal); + setAction({G_FCMP, 1, S64}, Legal); + setAction({G_FMUL, S32}, Legal); - setAction({G_GEP, P1}, Legal); - setAction({G_GEP, P2}, Legal); - setAction({G_GEP, 1, S64}, Legal); + setAction({G_ZEXT, S64}, Legal); + setAction({G_ZEXT, 1, S32}, Legal); + + setAction({G_FPTOSI, S32}, Legal); + setAction({G_FPTOSI, 1, S32}, Legal); + + setAction({G_SITOFP, S32}, Legal); + setAction({G_SITOFP, 1, S32}, Legal); + + setAction({G_FPTOUI, S32}, Legal); + setAction({G_FPTOUI, 1, S32}, Legal); + + for (LLT PtrTy : AddrSpaces) { + LLT IdxTy = LLT::scalar(PtrTy.getSizeInBits()); + setAction({G_GEP, PtrTy}, Legal); + setAction({G_GEP, 1, IdxTy}, Legal); + } setAction({G_ICMP, S1}, Legal); setAction({G_ICMP, 1, S32}, Legal); - setAction({G_LOAD, P1}, Legal); - setAction({G_LOAD, P2}, Legal); - setAction({G_LOAD, S32}, Legal); - setAction({G_LOAD, 1, P1}, Legal); - setAction({G_LOAD, 1, P2}, Legal); - setAction({G_OR, S32}, Legal); + getActionDefinitionsBuilder({G_LOAD, G_STORE}) + .legalIf([=, &ST](const LegalityQuery &Query) { + const LLT &Ty0 = Query.Types[0]; + + // TODO: Decompose private loads into 4-byte components. + // TODO: Illegal flat loads on SI + switch (Ty0.getSizeInBits()) { + case 32: + case 64: + case 128: + return true; + + case 96: + // XXX hasLoadX3 + return (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS); + + case 256: + case 512: + // TODO: constant loads + default: + return false; + } + }); + + setAction({G_SELECT, S32}, Legal); setAction({G_SELECT, 1, S1}, Legal); setAction({G_SHL, S32}, Legal); - setAction({G_STORE, S32}, Legal); - setAction({G_STORE, 1, P1}, Legal); // FIXME: When RegBankSelect inserts copies, it will only create new // registers with scalar types. This means we can end up with @@ -83,8 +159,54 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { // if it sees a generic instruction which isn't legal, so we need to // tell it that scalar types are legal for pointer operands setAction({G_GEP, S64}, Legal); - setAction({G_LOAD, 1, S64}, Legal); - setAction({G_STORE, 1, S64}, Legal); + + for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { + getActionDefinitionsBuilder(Op) + .legalIf([=](const LegalityQuery &Query) { + const LLT &VecTy = Query.Types[1]; + const LLT &IdxTy = Query.Types[2]; + return VecTy.getSizeInBits() % 32 == 0 && + VecTy.getSizeInBits() <= 512 && + IdxTy.getSizeInBits() == 32; + }); + } + + // FIXME: Doesn't handle extract of illegal sizes. + getActionDefinitionsBuilder({G_EXTRACT, G_INSERT}) + .legalIf([=](const LegalityQuery &Query) { + const LLT &Ty0 = Query.Types[0]; + const LLT &Ty1 = Query.Types[1]; + return (Ty0.getSizeInBits() % 32 == 0) && + (Ty1.getSizeInBits() % 32 == 0); + }); + + // Merge/Unmerge + for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { + unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; + unsigned LitTyIdx = Op == G_MERGE_VALUES ? 
1 : 0; + + getActionDefinitionsBuilder(Op) + .legalIf([=](const LegalityQuery &Query) { + const LLT &BigTy = Query.Types[BigTyIdx]; + const LLT &LitTy = Query.Types[LitTyIdx]; + return BigTy.getSizeInBits() % 32 == 0 && + LitTy.getSizeInBits() % 32 == 0 && + BigTy.getSizeInBits() <= 512; + }) + // Any vectors left are the wrong size. Scalarize them. + .fewerElementsIf([](const LegalityQuery &Query) { return true; }, + [](const LegalityQuery &Query) { + return std::make_pair( + 0, Query.Types[0].getElementType()); + }) + .fewerElementsIf([](const LegalityQuery &Query) { return true; }, + [](const LegalityQuery &Query) { + return std::make_pair( + 1, Query.Types[1].getElementType()); + }); + + } computeTables(); + verify(*ST.getInstrInfo()); } diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 291e3361f163..1cbd37c42c4b 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -19,12 +19,15 @@ namespace llvm { +class GCNTargetMachine; class LLVMContext; +class GCNSubtarget; /// This class provides the information for the target register banks. class AMDGPULegalizerInfo : public LegalizerInfo { public: - AMDGPULegalizerInfo(); + AMDGPULegalizerInfo(const GCNSubtarget &ST, + const GCNTargetMachine &TM); }; } // End llvm namespace. #endif diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp index f594767c8edb..7a7ed7a4f065 100644 --- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This file does AMD library function optimizations. +/// This file does AMD library function optimizations. 
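/// Calls into the device math library (pow, rootn, fma/mad, sqrt, sincos, ...)
/// are pattern-matched and replaced with cheaper straight-line IR, or with
/// their native_* counterparts where that is profitable; see the fold_*
/// helpers below.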
// //===----------------------------------------------------------------------===// @@ -765,8 +765,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { ArrayRef<double> tmp(DVal); nval = ConstantDataVector::get(context, tmp); } - DEBUG(errs() << "AMDIC: " << *CI - << " ---> " << *nval << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n"); replaceCall(nval); return true; } @@ -776,8 +775,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { for (int i = 0; i < sz; ++i) { if (CF->isExactlyValue(ftbl[i].input)) { Value *nval = ConstantFP::get(CF->getType(), ftbl[i].result); - DEBUG(errs() << "AMDIC: " << *CI - << " ---> " << *nval << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n"); replaceCall(nval); return true; } @@ -798,11 +796,11 @@ bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) { AMDGPULibFunc nf = FInfo; nf.setPrefix(AMDGPULibFunc::NATIVE); if (Constant *FPExpr = getFunction(M, nf)) { - DEBUG(dbgs() << "AMDIC: " << *CI << " ---> "); + LLVM_DEBUG(dbgs() << "AMDIC: " << *CI << " ---> "); CI->setCalledFunction(FPExpr); - DEBUG(dbgs() << *CI << '\n'); + LLVM_DEBUG(dbgs() << *CI << '\n'); return true; } @@ -820,8 +818,7 @@ bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B, Value *nval = B.CreateFDiv(ConstantFP::get(CF->getType(), 1.0), opr0, "recip2div"); - DEBUG(errs() << "AMDIC: " << *CI - << " ---> " << *nval << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n"); replaceCall(nval); return true; } @@ -899,7 +896,7 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0) || CZero) { // pow/powr/pown(x, 0) == 1 - DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n"); Constant *cnval = ConstantFP::get(eltType, 1.0); if (getVecSize(FInfo) > 1) { cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval); @@ -909,23 +906,21 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, } if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) { // pow/powr/pown(x, 1.0) = x - DEBUG(errs() << "AMDIC: " << *CI - << " ---> " << *opr0 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n"); replaceCall(opr0); return true; } if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) { // pow/powr/pown(x, 2.0) = x*x - DEBUG(errs() << "AMDIC: " << *CI - << " ---> " << *opr0 << " * " << *opr0 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * " << *opr0 + << "\n"); Value *nval = B.CreateFMul(opr0, opr0, "__pow2"); replaceCall(nval); return true; } if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) { // pow/powr/pown(x, -1.0) = 1.0/x - DEBUG(errs() << "AMDIC: " << *CI - << " ---> 1 / " << *opr0 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1 / " << *opr0 << "\n"); Constant *cnval = ConstantFP::get(eltType, 1.0); if (getVecSize(FInfo) > 1) { cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval); @@ -942,8 +937,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(issqrt ? 
AMDGPULibFunc::EI_SQRT : AMDGPULibFunc::EI_RSQRT, FInfo))) { - DEBUG(errs() << "AMDIC: " << *CI << " ---> " - << FInfo.getName().c_str() << "(" << *opr0 << ")\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " + << FInfo.getName().c_str() << "(" << *opr0 << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt" : "__pow2rsqrt"); replaceCall(nval); @@ -999,8 +994,9 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, } nval = B.CreateFDiv(cnval, nval, "__1powprod"); } - DEBUG(errs() << "AMDIC: " << *CI << " ---> " - << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0 << ")\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " + << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0 + << ")\n"); replaceCall(nval); return true; } @@ -1137,8 +1133,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, nval = B.CreateBitCast(nval, opr0->getType()); } - DEBUG(errs() << "AMDIC: " << *CI << " ---> " - << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " + << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n"); replaceCall(nval); return true; @@ -1155,8 +1151,7 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B, } int ci_opr1 = (int)CINT->getSExtValue(); if (ci_opr1 == 1) { // rootn(x, 1) = x - DEBUG(errs() << "AMDIC: " << *CI - << " ---> " << *opr0 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n"); replaceCall(opr0); return true; } @@ -1166,7 +1161,7 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B, Module *M = CI->getModule(); if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) { - DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt"); replaceCall(nval); return true; @@ -1175,13 +1170,13 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B, Module *M = CI->getModule(); if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) { - DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt"); replaceCall(nval); return true; } } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x - DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n"); Value *nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0), opr0, "__rootn2div"); @@ -1193,7 +1188,8 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B, Module *M = CI->getModule(); if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT, FInfo))) { - DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0 << ")\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0 + << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt"); replaceCall(nval); return true; @@ -1212,22 +1208,22 @@ bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B, ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1); if ((CF0 && CF0->isZero()) || (CF1 && CF1->isZero())) { // fma/mad(a, b, c) = c if a=0 || b=0 - DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n"); replaceCall(opr2); return true; } if (CF0 && CF0->isExactlyValue(1.0f)) { // 
fma/mad(a, b, c) = b+c if a=1 - DEBUG(errs() << "AMDIC: " << *CI << " ---> " - << *opr1 << " + " << *opr2 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr1 << " + " << *opr2 + << "\n"); Value *nval = B.CreateFAdd(opr1, opr2, "fmaadd"); replaceCall(nval); return true; } if (CF1 && CF1->isExactlyValue(1.0f)) { // fma/mad(a, b, c) = a+c if b=1 - DEBUG(errs() << "AMDIC: " << *CI << " ---> " - << *opr0 << " + " << *opr2 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " + " << *opr2 + << "\n"); Value *nval = B.CreateFAdd(opr0, opr2, "fmaadd"); replaceCall(nval); return true; @@ -1235,8 +1231,8 @@ bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B, if (ConstantFP *CF = dyn_cast<ConstantFP>(opr2)) { if (CF->isZero()) { // fma/mad(a, b, c) = a*b if c=0 - DEBUG(errs() << "AMDIC: " << *CI << " ---> " - << *opr0 << " * " << *opr1 << "\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * " + << *opr1 << "\n"); Value *nval = B.CreateFMul(opr0, opr1, "fmamul"); replaceCall(nval); return true; @@ -1263,8 +1259,8 @@ bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B, if (Constant *FPExpr = getNativeFunction( CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) { Value *opr0 = CI->getArgOperand(0); - DEBUG(errs() << "AMDIC: " << *CI << " ---> " - << "sqrt(" << *opr0 << ")\n"); + LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " + << "sqrt(" << *opr0 << ")\n"); Value *nval = CreateCallEx(B,FPExpr, opr0, "__sqrt"); replaceCall(nval); return true; @@ -1355,8 +1351,8 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B, P = B.CreateAddrSpaceCast(Alloc, PTy); CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P); - DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI - << ") with " << *Call << "\n"); + LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI << ") with " + << *Call << "\n"); if (!isSin) { // CI->cos, UI->sin B.SetInsertPoint(&*ItOld); @@ -1719,9 +1715,8 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) { bool Changed = false; auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - DEBUG(dbgs() << "AMDIC: process function "; - F.printAsOperand(dbgs(), false, F.getParent()); - dbgs() << '\n';); + LLVM_DEBUG(dbgs() << "AMDIC: process function "; + F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';); if (!EnablePreLink) Changed |= setFastFlags(F, Options); @@ -1737,8 +1732,8 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) { Function *Callee = CI->getCalledFunction(); if (Callee == 0) continue; - DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n"; - dbgs().flush()); + LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n"; + dbgs().flush()); if(Simplifier.fold(CI, AA)) Changed = true; } diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.h b/lib/Target/AMDGPU/AMDGPULibFunc.h index 5405bc645714..fe062384800a 100644 --- a/lib/Target/AMDGPU/AMDGPULibFunc.h +++ b/lib/Target/AMDGPU/AMDGPULibFunc.h @@ -1,4 +1,4 @@ -//===-- AMDGPULibFunc.h ---------------------------------------------------===// +//===-- AMDGPULibFunc.h ----------------------------------------*- C++ -*--===// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp index 7e0e9802c0e6..2cec8fe53283 100644 --- a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -117,7 +117,6 @@ bool 
AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const { return false; const TargetMachine &TM = TPC->getTM<TargetMachine>(); - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(F); bool Changed = false; for (auto *U : F.users()) { @@ -125,7 +124,7 @@ bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const { if (!CI) continue; - Changed |= ST.makeLIDRangeMetadata(CI); + Changed |= AMDGPUSubtarget::get(TM, F).makeLIDRangeMetadata(CI); } return Changed; } diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp new file mode 100644 index 000000000000..8cc7e38f7b29 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -0,0 +1,264 @@ +//===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This pass replaces accesses to kernel arguments with loads from +/// offsets from the kernarg base pointer. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" + +#define DEBUG_TYPE "amdgpu-lower-kernel-arguments" + +using namespace llvm; + +namespace { + +class AMDGPULowerKernelArguments : public FunctionPass{ +public: + static char ID; + + AMDGPULowerKernelArguments() : FunctionPass(ID) {} + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesAll(); + } +}; + +} // end anonymous namespace + +bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { + CallingConv::ID CC = F.getCallingConv(); + if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty()) + return false; + + auto &TPC = getAnalysis<TargetPassConfig>(); + + const TargetMachine &TM = TPC.getTM<TargetMachine>(); + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); + LLVMContext &Ctx = F.getParent()->getContext(); + const DataLayout &DL = F.getParent()->getDataLayout(); + BasicBlock &EntryBlock = *F.begin(); + IRBuilder<> Builder(&*EntryBlock.begin()); + + const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary + const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F); + + unsigned MaxAlign; + // FIXME: Alignment is broken with explicit arg offset. + const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign); + if (TotalKernArgSize == 0) + return false; + + CallInst *KernArgSegment = + Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, nullptr, 
F.getName() + ".kernarg.segment"); + + KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); + KernArgSegment->addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); + + unsigned AS = KernArgSegment->getType()->getPointerAddressSpace(); + uint64_t ExplicitArgOffset = 0; + + for (Argument &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + unsigned Align = DL.getABITypeAlignment(ArgTy); + unsigned Size = DL.getTypeSizeInBits(ArgTy); + unsigned AllocSize = DL.getTypeAllocSize(ArgTy); + + + // Clover seems to always pad i8/i16 to i32, but doesn't properly align + // them? + // Make sure the struct elements have correct size and alignment for ext + // args. These seem to be padded up to 4-bytes but not correctly aligned. + bool IsExtArg = AllocSize < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) && + !ST.isAmdHsaOS(); + if (IsExtArg) + AllocSize = 4; + + uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset; + ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize; + + if (Arg.use_empty()) + continue; + + if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) { + // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing + // modes on SI to know the high bits are 0 so pointer adds don't wrap. We + // can't represent this with range metadata because it's only allowed for + // integer types. + if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) + continue; + + // FIXME: We can replace this with equivalent alias.scope/noalias + // metadata, but this appears to be a lot of work. + if (Arg.hasNoAliasAttr()) + continue; + } + + VectorType *VT = dyn_cast<VectorType>(ArgTy); + bool IsV3 = VT && VT->getNumElements() == 3; + VectorType *V4Ty = nullptr; + + int64_t AlignDownOffset = alignDown(EltOffset, 4); + int64_t OffsetDiff = EltOffset - AlignDownOffset; + unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset); + + Value *ArgPtr; + if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types + // Since we don't have sub-dword scalar loads, avoid doing an extload by + // loading earlier than the argument address, and extracting the relevant + // bits. + // + // Additionally widen any sub-dword load to i32 even if suitably aligned, + // so that CSE between different argument loads works easily. 
+ + ArgPtr = Builder.CreateConstInBoundsGEP1_64( + KernArgSegment, + AlignDownOffset, + Arg.getName() + ".kernarg.offset.align.down"); + ArgPtr = Builder.CreateBitCast(ArgPtr, + Builder.getInt32Ty()->getPointerTo(AS), + ArgPtr->getName() + ".cast"); + } else { + ArgPtr = Builder.CreateConstInBoundsGEP1_64( + KernArgSegment, + AlignDownOffset, + Arg.getName() + ".kernarg.offset"); + ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS), + ArgPtr->getName() + ".cast"); + } + + assert((!IsExtArg || !IsV3) && "incompatible situation"); + + if (IsV3 && Size >= 32) { + V4Ty = VectorType::get(VT->getVectorElementType(), 4); + // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads + ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS)); + } + + LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign); + Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {})); + + MDBuilder MDB(Ctx); + + if (isa<PointerType>(ArgTy)) { + if (Arg.hasNonNullAttr()) + Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {})); + + uint64_t DerefBytes = Arg.getDereferenceableBytes(); + if (DerefBytes != 0) { + Load->setMetadata( + LLVMContext::MD_dereferenceable, + MDNode::get(Ctx, + MDB.createConstant( + ConstantInt::get(Builder.getInt64Ty(), DerefBytes)))); + } + + uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes(); + if (DerefOrNullBytes != 0) { + Load->setMetadata( + LLVMContext::MD_dereferenceable_or_null, + MDNode::get(Ctx, + MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(), + DerefOrNullBytes)))); + } + + unsigned ParamAlign = Arg.getParamAlignment(); + if (ParamAlign != 0) { + Load->setMetadata( + LLVMContext::MD_align, + MDNode::get(Ctx, + MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(), + ParamAlign)))); + } + } + + // TODO: Convert noalias arg to !noalias + + if (Size < 32 && !ArgTy->isAggregateType()) { + if (IsExtArg && OffsetDiff == 0) { + Type *I32Ty = Builder.getInt32Ty(); + bool IsSext = Arg.hasSExtAttr(); + Metadata *LowAndHigh[] = { + ConstantAsMetadata::get( + ConstantInt::get(I32Ty, IsSext ? minIntN(Size) : 0)), + ConstantAsMetadata::get( + ConstantInt::get(I32Ty, + IsSext ? maxIntN(Size) + 1 : maxUIntN(Size) + 1)) + }; + + Load->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, LowAndHigh)); + } + + Value *ExtractBits = OffsetDiff == 0 ? 
+ Load : Builder.CreateLShr(Load, OffsetDiff * 8); + + IntegerType *ArgIntTy = Builder.getIntNTy(Size); + Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy); + Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy, + Arg.getName() + ".load"); + Arg.replaceAllUsesWith(NewVal); + } else if (IsV3) { + Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty), + {0, 1, 2}, + Arg.getName() + ".load"); + Arg.replaceAllUsesWith(Shuf); + } else { + Load->setName(Arg.getName() + ".load"); + Arg.replaceAllUsesWith(Load); + } + } + + KernArgSegment->addAttribute( + AttributeList::ReturnIndex, + Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); + + return true; +} + +INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE, + "AMDGPU Lower Kernel Arguments", false, false) +INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments", + false, false) + +char AMDGPULowerKernelArguments::ID = 0; + +FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() { + return new AMDGPULowerKernelArguments(); +} diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp new file mode 100644 index 000000000000..a43dcef4cf0b --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -0,0 +1,270 @@ +//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This pass attempts to make use of reqd_work_group_size metadata +/// to eliminate loads from the dispatch packet and to constant fold OpenCL +/// get_local_size-like functions. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Pass.h" + +#define DEBUG_TYPE "amdgpu-lower-kernel-attributes" + +using namespace llvm; + +namespace { + +// Field offsets in hsa_kernel_dispatch_packet_t. 
+enum DispatchPackedOffsets { + WORKGROUP_SIZE_X = 4, + WORKGROUP_SIZE_Y = 6, + WORKGROUP_SIZE_Z = 8, + + GRID_SIZE_X = 12, + GRID_SIZE_Y = 16, + GRID_SIZE_Z = 20 +}; + +class AMDGPULowerKernelAttributes : public ModulePass { + Module *Mod = nullptr; + +public: + static char ID; + + AMDGPULowerKernelAttributes() : ModulePass(ID) {} + + bool processUse(CallInst *CI); + + bool doInitialization(Module &M) override; + bool runOnModule(Module &M) override; + + StringRef getPassName() const override { + return "AMDGPU Kernel Attributes"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } +}; + +} // end anonymous namespace + +bool AMDGPULowerKernelAttributes::doInitialization(Module &M) { + Mod = &M; + return false; +} + +bool AMDGPULowerKernelAttributes::processUse(CallInst *CI) { + Function *F = CI->getParent()->getParent(); + + auto MD = F->getMetadata("reqd_work_group_size"); + const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3; + + const bool HasUniformWorkGroupSize = + F->getFnAttribute("uniform-work-group-size").getValueAsString() == "true"; + + if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize) + return false; + + Value *WorkGroupSizeX = nullptr; + Value *WorkGroupSizeY = nullptr; + Value *WorkGroupSizeZ = nullptr; + + Value *GridSizeX = nullptr; + Value *GridSizeY = nullptr; + Value *GridSizeZ = nullptr; + + const DataLayout &DL = Mod->getDataLayout(); + + // We expect to see several GEP users, casted to the appropriate type and + // loaded. + for (User *U : CI->users()) { + if (!U->hasOneUse()) + continue; + + int64_t Offset = 0; + if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI) + continue; + + auto *BCI = dyn_cast<BitCastInst>(*U->user_begin()); + if (!BCI || !BCI->hasOneUse()) + continue; + + auto *Load = dyn_cast<LoadInst>(*BCI->user_begin()); + if (!Load || !Load->isSimple()) + continue; + + unsigned LoadSize = DL.getTypeStoreSize(Load->getType()); + + // TODO: Handle merged loads. + switch (Offset) { + case WORKGROUP_SIZE_X: + if (LoadSize == 2) + WorkGroupSizeX = Load; + break; + case WORKGROUP_SIZE_Y: + if (LoadSize == 2) + WorkGroupSizeY = Load; + break; + case WORKGROUP_SIZE_Z: + if (LoadSize == 2) + WorkGroupSizeZ = Load; + break; + case GRID_SIZE_X: + if (LoadSize == 4) + GridSizeX = Load; + break; + case GRID_SIZE_Y: + if (LoadSize == 4) + GridSizeY = Load; + break; + case GRID_SIZE_Z: + if (LoadSize == 4) + GridSizeZ = Load; + break; + default: + break; + } + } + + // Pattern match the code used to handle partial workgroup dispatches in the + // library implementation of get_local_size, so the entire function can be + // constant folded with a known group size. + // + // uint r = grid_size - group_id * group_size; + // get_local_size = (r < group_size) ? r : group_size; + // + // If we have uniform-work-group-size (which is the default in OpenCL 1.2), + // the grid_size is required to be a multiple of group_size). In this case: + // + // grid_size - (group_id * group_size) < group_size + // -> + // grid_size < group_size + (group_id * group_size) + // + // (grid_size / group_size) < 1 + group_id + // + // grid_size / group_size is at least 1, so we can conclude the select + // condition is false (except for group_id == 0, where the select result is + // the same). 
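To see why the select always folds under a uniform work-group size, here is a small standalone C++ check of the identity derived in the comment above. The variable names mirror the comment; this is a sketch of the arithmetic, not code from the pass:

#include <cassert>
#include <cstdint>

int main() {
  // Uniform work-group size: grid_size is a multiple of group_size.
  const uint32_t group_size = 64;
  const uint32_t num_groups = 4;
  const uint32_t grid_size = num_groups * group_size; // 256

  for (uint32_t group_id = 0; group_id < num_groups; ++group_id) {
    uint32_t r = grid_size - group_id * group_size; // 256, 192, 128, 64
    uint32_t local_size = (r < group_size) ? r : group_size;
    assert(local_size == group_size); // the select never picks the partial size
  }
  return 0;
}

With grid_size = 256 and group_size = 64, r takes the values 256, 192, 128 and 64, so r < group_size never holds and get_local_size is always the full group_size, exactly as the pass concludes.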
+ + bool MadeChange = false; + Value *WorkGroupSizes[3] = { WorkGroupSizeX, WorkGroupSizeY, WorkGroupSizeZ }; + Value *GridSizes[3] = { GridSizeX, GridSizeY, GridSizeZ }; + + for (int I = 0; HasUniformWorkGroupSize && I < 3; ++I) { + Value *GroupSize = WorkGroupSizes[I]; + Value *GridSize = GridSizes[I]; + if (!GroupSize || !GridSize) + continue; + + for (User *U : GroupSize->users()) { + auto *ZextGroupSize = dyn_cast<ZExtInst>(U); + if (!ZextGroupSize) + continue; + + for (User *ZextUser : ZextGroupSize->users()) { + auto *SI = dyn_cast<SelectInst>(ZextUser); + if (!SI) + continue; + + using namespace llvm::PatternMatch; + auto GroupIDIntrin = I == 0 ? + m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() : + (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() : + m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>()); + + auto SubExpr = m_Sub(m_Specific(GridSize), + m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))); + + ICmpInst::Predicate Pred; + if (match(SI, + m_Select(m_ICmp(Pred, SubExpr, m_Specific(ZextGroupSize)), + SubExpr, + m_Specific(ZextGroupSize))) && + Pred == ICmpInst::ICMP_ULT) { + if (HasReqdWorkGroupSize) { + ConstantInt *KnownSize + = mdconst::extract<ConstantInt>(MD->getOperand(I)); + SI->replaceAllUsesWith(ConstantExpr::getIntegerCast(KnownSize, + SI->getType(), + false)); + } else { + SI->replaceAllUsesWith(ZextGroupSize); + } + + MadeChange = true; + } + } + } + } + + if (!HasReqdWorkGroupSize) + return MadeChange; + + // Eliminate any other loads we can from the dispatch packet. + for (int I = 0; I < 3; ++I) { + Value *GroupSize = WorkGroupSizes[I]; + if (!GroupSize) + continue; + + ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I)); + GroupSize->replaceAllUsesWith( + ConstantExpr::getIntegerCast(KnownSize, + GroupSize->getType(), + false)); + MadeChange = true; + } + + return MadeChange; +} + +// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get +// TargetPassConfig for subtarget. +bool AMDGPULowerKernelAttributes::runOnModule(Module &M) { + StringRef DispatchPtrName + = Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr); + + Function *DispatchPtr = Mod->getFunction(DispatchPtrName); + if (!DispatchPtr) // Dispatch ptr not used. + return false; + + bool MadeChange = false; + + SmallPtrSet<Instruction *, 4> HandledUses; + for (auto *U : DispatchPtr->users()) { + CallInst *CI = cast<CallInst>(U); + if (HandledUses.insert(CI).second) { + if (processUse(CI)) + MadeChange = true; + } + } + + return MadeChange; +} + +INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE, + "AMDGPU IR optimizations", false, false) +INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE, "AMDGPU IR optimizations", + false, false) + +char AMDGPULowerKernelAttributes::ID = 0; + +ModulePass *llvm::createAMDGPULowerKernelAttributesPass() { + return new AMDGPULowerKernelAttributes(); +} diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 23fd8113932c..1876dc3f7122 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -8,16 +8,17 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst. +/// Code to lower AMDGPU MachineInstrs to their corresponding MCInst. 
// //===----------------------------------------------------------------------===// // -#include "AMDGPUMCInstLower.h" #include "AMDGPUAsmPrinter.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "InstPrinter/AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "R600AsmPrinter.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" @@ -36,9 +37,43 @@ using namespace llvm; +namespace { + +class AMDGPUMCInstLower { + MCContext &Ctx; + const TargetSubtargetInfo &ST; + const AsmPrinter &AP; + + const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB, + const MachineOperand &MO) const; + +public: + AMDGPUMCInstLower(MCContext &ctx, const TargetSubtargetInfo &ST, + const AsmPrinter &AP); + + bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; + + /// Lower a MachineInstr to an MCInst + void lower(const MachineInstr *MI, MCInst &OutMI) const; + +}; + +class R600MCInstLower : public AMDGPUMCInstLower { +public: + R600MCInstLower(MCContext &ctx, const R600Subtarget &ST, + const AsmPrinter &AP); + + /// Lower a MachineInstr to an MCInst + void lower(const MachineInstr *MI, MCInst &OutMI) const; +}; + + +} // End anonymous namespace + #include "AMDGPUGenMCPseudoLowering.inc" -AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st, +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, + const TargetSubtargetInfo &st, const AsmPrinter &ap): Ctx(ctx), ST(st), AP(ap) { } @@ -129,7 +164,7 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO, void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { unsigned Opcode = MI->getOpcode(); - const auto *TII = ST.getInstrInfo(); + const auto *TII = static_cast<const SIInstrInfo*>(ST.getInstrInfo()); // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We // need to select it to the subtarget specific version, and there's no way to @@ -169,16 +204,18 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { - const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>(); + const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>(); AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this); return MCInstLowering.lowerOperand(MO, MCOp); } -const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) { +static const MCExpr *lowerAddrSpaceCast(const TargetMachine &TM, + const Constant *CV, + MCContext &OutContext) { // TargetMachine does not support llvm-style cast. Use C++-style cast. // This is safe since TM is always of type AMDGPUTargetMachine or its // derived class. - auto *AT = static_cast<AMDGPUTargetMachine*>(&TM); + auto &AT = static_cast<const AMDGPUTargetMachine&>(TM); auto *CE = dyn_cast<ConstantExpr>(CV); // Lower null pointers in private and local address space. 
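As background for the null-pointer check that continues in the next hunk, here is a hedged C++ sketch of the null-value convention this lowering relies on. The helper name nullPointerValueFor, the concrete address-space numbers, and the all-ones value for local/private null are assumptions about the AMDGPU convention used for illustration, not something stated in this diff:

#include <cstdint>

// Hypothetical stand-in for a getNullPointerValue()-style query: flat and
// global null lower to 0, while local (LDS) and private (scratch) null use
// the all-ones pattern, so "null as a local pointer" is emitted as -1.
static int64_t nullPointerValueFor(unsigned AddrSpace) {
  const unsigned LocalAS = 3;   // assumed address-space number for LDS
  const unsigned PrivateAS = 5; // assumed address-space number for scratch
  return (AddrSpace == LocalAS || AddrSpace == PrivateAS) ? -1 : 0;
}

Given a mapping like this, a constant addrspacecast of a flat null pointer (numeric value 0) into local or private memory can be emitted directly as the destination space's numeric null, which is what the MCConstantExpr::create call in the following hunk produces.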
@@ -187,12 +224,18 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) { if (CE && CE->getOpcode() == Instruction::AddrSpaceCast) { auto Op = CE->getOperand(0); auto SrcAddr = Op->getType()->getPointerAddressSpace(); - if (Op->isNullValue() && AT->getNullPointerValue(SrcAddr) == 0) { + if (Op->isNullValue() && AT.getNullPointerValue(SrcAddr) == 0) { auto DstAddr = CE->getType()->getPointerAddressSpace(); - return MCConstantExpr::create(AT->getNullPointerValue(DstAddr), + return MCConstantExpr::create(AT.getNullPointerValue(DstAddr), OutContext); } } + return nullptr; +} + +const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) { + if (const MCExpr *E = lowerAddrSpaceCast(TM, CV, OutContext)) + return E; return AsmPrinter::lowerConstant(CV); } @@ -200,7 +243,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; - const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>(); + const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>(); AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this); StringRef Err; @@ -292,3 +335,47 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { } } } + +R600MCInstLower::R600MCInstLower(MCContext &Ctx, const R600Subtarget &ST, + const AsmPrinter &AP) : + AMDGPUMCInstLower(Ctx, ST, AP) { } + +void R600MCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + for (const MachineOperand &MO : MI->explicit_operands()) { + MCOperand MCOp; + lowerOperand(MO, MCOp); + OutMI.addOperand(MCOp); + } +} + +void R600AsmPrinter::EmitInstruction(const MachineInstr *MI) { + const R600Subtarget &STI = MF->getSubtarget<R600Subtarget>(); + R600MCInstLower MCInstLowering(OutContext, STI, *this); + + StringRef Err; + if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) { + LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext(); + C.emitError("Illegal instruction detected: " + Err); + MI->print(errs()); + } + + if (MI->isBundle()) { + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_instr_iterator I = ++MI->getIterator(); + while (I != MBB->instr_end() && I->isInsideBundle()) { + EmitInstruction(&*I); + ++I; + } + } else { + MCInst TmpInst; + MCInstLowering.lower(MI, TmpInst); + EmitToStreamer(*OutStreamer, TmpInst); + } +} + +const MCExpr *R600AsmPrinter::lowerConstant(const Constant *CV) { + if (const MCExpr *E = lowerAddrSpaceCast(TM, CV, OutContext)) + return E; + return AsmPrinter::lowerConstant(CV); +} diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/lib/Target/AMDGPU/AMDGPUMCInstLower.h deleted file mode 100644 index 57d2d85daecd..000000000000 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.h +++ /dev/null @@ -1,46 +0,0 @@ -//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H -#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H - -namespace llvm { - -class AMDGPUSubtarget; -class AsmPrinter; -class MachineBasicBlock; -class MachineInstr; -class MachineOperand; -class MCContext; -class MCExpr; -class MCInst; -class MCOperand; - -class AMDGPUMCInstLower { - MCContext &Ctx; - const AMDGPUSubtarget &ST; - const AsmPrinter &AP; - - const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB, - const MachineOperand &MO) const; - -public: - AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST, - const AsmPrinter &AP); - - bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; - - /// \brief Lower a MachineInstr to an MCInst - void lower(const MachineInstr *MI, MCInst &OutMI) const; - -}; - -} // End namespace llvm - -#endif diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 20918233e447..6f44e2dbb2d5 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -31,6 +31,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Config/llvm-config.h" #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" #include "llvm/Support/Compiler.h" @@ -658,7 +659,7 @@ RegionMRT *MRT::buildMRT(MachineFunction &MF, continue; } - DEBUG(dbgs() << "Visiting " << printMBBReference(*MBB) << "\n"); + LLVM_DEBUG(dbgs() << "Visiting " << printMBBReference(*MBB) << "\n"); MBBMRT *NewMBB = new MBBMRT(MBB); MachineRegion *Region = RegionInfo->getRegionFor(MBB); @@ -695,18 +696,19 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, const TargetRegisterInfo *TRI, PHILinearize &PHIInfo) { if (TRI->isVirtualRegister(Reg)) { - DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) + << "\n"); // If this is a source register to a PHI we are chaining, it // must be live out. 
if (PHIInfo.isSource(Reg)) { - DEBUG(dbgs() << "Add LiveOut (PHI): " << printReg(Reg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "Add LiveOut (PHI): " << printReg(Reg, TRI) << "\n"); addLiveOut(Reg); } else { // If this is live out of the MBB for (auto &UI : MRI->use_operands(Reg)) { if (UI.getParent()->getParent() != MBB) { - DEBUG(dbgs() << "Add LiveOut (MBB " << printMBBReference(*MBB) - << "): " << printReg(Reg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "Add LiveOut (MBB " << printMBBReference(*MBB) + << "): " << printReg(Reg, TRI) << "\n"); addLiveOut(Reg); } else { // If the use is in the same MBB we have to make sure @@ -717,8 +719,8 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, MIE = UseInstr->getParent()->instr_end(); MII != MIE; ++MII) { if ((&(*MII)) == DefInstr) { - DEBUG(dbgs() << "Add LiveOut (Loop): " << printReg(Reg, TRI) - << "\n"); + LLVM_DEBUG(dbgs() << "Add LiveOut (Loop): " << printReg(Reg, TRI) + << "\n"); addLiveOut(Reg); } } @@ -734,11 +736,12 @@ void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg, const TargetRegisterInfo *TRI, PHILinearize &PHIInfo) { if (TRI->isVirtualRegister(Reg)) { - DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) + << "\n"); for (auto &UI : MRI->use_operands(Reg)) { if (!Region->contains(UI.getParent()->getParent())) { - DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region - << "): " << printReg(Reg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region + << "): " << printReg(Reg, TRI) << "\n"); addLiveOut(Reg); } } @@ -749,8 +752,8 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, const TargetRegisterInfo *TRI, PHILinearize &PHIInfo) { - DEBUG(dbgs() << "-Store Live Outs Begin (" << printMBBReference(*MBB) - << ")-\n"); + LLVM_DEBUG(dbgs() << "-Store Live Outs Begin (" << printMBBReference(*MBB) + << ")-\n"); for (auto &II : *MBB) { for (auto &RI : II.defs()) { storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo); @@ -774,9 +777,10 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB, for (int i = 0; i < numPreds; ++i) { if (getPHIPred(PHI, i) == MBB) { unsigned PHIReg = getPHISourceReg(PHI, i); - DEBUG(dbgs() << "Add LiveOut (PhiSource " << printMBBReference(*MBB) - << " -> " << printMBBReference(*(*SI)) - << "): " << printReg(PHIReg, TRI) << "\n"); + LLVM_DEBUG(dbgs() + << "Add LiveOut (PhiSource " << printMBBReference(*MBB) + << " -> " << printMBBReference(*(*SI)) + << "): " << printReg(PHIReg, TRI) << "\n"); addLiveOut(PHIReg); } } @@ -784,7 +788,7 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB, } } - DEBUG(dbgs() << "-Store Live Outs Endn-\n"); + LLVM_DEBUG(dbgs() << "-Store Live Outs Endn-\n"); } void LinearizedRegion::storeMBBLiveOuts(MachineBasicBlock *MBB, @@ -844,8 +848,8 @@ void LinearizedRegion::storeLiveOuts(RegionMRT *Region, for (int i = 0; i < numPreds; ++i) { if (Region->contains(getPHIPred(PHI, i))) { unsigned PHIReg = getPHISourceReg(PHI, i); - DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region - << "): " << printReg(PHIReg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region + << "): " << printReg(PHIReg, TRI) << "\n"); addLiveOut(PHIReg); } } @@ -909,20 +913,21 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister, bool IncludeLoopPHI) { assert(Register != NewRegister && "Cannot replace a reg with 
itself"); - DEBUG(dbgs() << "Pepareing to replace register (region): " - << printReg(Register, MRI->getTargetRegisterInfo()) << " with " - << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); + LLVM_DEBUG( + dbgs() << "Pepareing to replace register (region): " + << printReg(Register, MRI->getTargetRegisterInfo()) << " with " + << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); // If we are replacing outside, we also need to update the LiveOuts if (ReplaceOutside && (isLiveOut(Register) || this->getParent()->isLiveOut(Register))) { LinearizedRegion *Current = this; while (Current != nullptr && Current->getEntry() != nullptr) { - DEBUG(dbgs() << "Region before register replace\n"); - DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo())); + LLVM_DEBUG(dbgs() << "Region before register replace\n"); + LLVM_DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo())); Current->replaceLiveOut(Register, NewRegister); - DEBUG(dbgs() << "Region after register replace\n"); - DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo())); + LLVM_DEBUG(dbgs() << "Region after register replace\n"); + LLVM_DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo())); Current = Current->getParent(); } } @@ -946,16 +951,16 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister, if (ShouldReplace) { if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { - DEBUG(dbgs() << "Trying to substitute physical register: " - << printReg(NewRegister, MRI->getTargetRegisterInfo()) - << "\n"); + LLVM_DEBUG(dbgs() << "Trying to substitute physical register: " + << printReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); llvm_unreachable("Cannot substitute physical registers"); } else { - DEBUG(dbgs() << "Replacing register (region): " - << printReg(Register, MRI->getTargetRegisterInfo()) - << " with " - << printReg(NewRegister, MRI->getTargetRegisterInfo()) - << "\n"); + LLVM_DEBUG(dbgs() << "Replacing register (region): " + << printReg(Register, MRI->getTargetRegisterInfo()) + << " with " + << printReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); O.setReg(NewRegister); } } @@ -1022,18 +1027,18 @@ void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) { if (hasNoDef(Reg, MRI)) continue; if (!MRI->hasOneDef(Reg)) { - DEBUG(this->getEntry()->getParent()->dump()); - DEBUG(dbgs() << printReg(Reg, TRI) << "\n"); + LLVM_DEBUG(this->getEntry()->getParent()->dump()); + LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << "\n"); } if (MRI->def_begin(Reg) == MRI->def_end()) { - DEBUG(dbgs() << "Register " - << printReg(Reg, MRI->getTargetRegisterInfo()) - << " has NO defs\n"); + LLVM_DEBUG(dbgs() << "Register " + << printReg(Reg, MRI->getTargetRegisterInfo()) + << " has NO defs\n"); } else if (!MRI->hasOneDef(Reg)) { - DEBUG(dbgs() << "Register " - << printReg(Reg, MRI->getTargetRegisterInfo()) - << " has multiple defs\n"); + LLVM_DEBUG(dbgs() << "Register " + << printReg(Reg, MRI->getTargetRegisterInfo()) + << " has multiple defs\n"); } assert(MRI->hasOneDef(Reg) && "Register has multiple definitions"); @@ -1041,8 +1046,8 @@ void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) { MachineOperand *UseOperand = &(RI); bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB; if (UseIsOutsideDefMBB && UseOperand->isKill()) { - DEBUG(dbgs() << "Removing kill flag on register: " - << printReg(Reg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "Removing kill flag on register: " + << printReg(Reg, TRI) << "\n"); 
UseOperand->setIsKill(false); } } @@ -1415,8 +1420,8 @@ void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) { MachineInstr &Instr = *I; if (Instr.isPHI()) { unsigned PHIDestReg = getPHIDestReg(Instr); - DEBUG(dbgs() << "Extractking killed phi:\n"); - DEBUG(Instr.dump()); + LLVM_DEBUG(dbgs() << "Extractking killed phi:\n"); + LLVM_DEBUG(Instr.dump()); PHIs.insert(&Instr); PHIInfo.addDest(PHIDestReg, Instr.getDebugLoc()); storePHILinearizationInfoDest(PHIDestReg, Instr); @@ -1448,9 +1453,10 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, MachineBasicBlock *SourceMBB, SmallVector<unsigned, 2> &PHIIndices, unsigned *ReplaceReg) { - DEBUG(dbgs() << "Shrink PHI: "); - DEBUG(PHI.dump()); - DEBUG(dbgs() << " to " << printReg(getPHIDestReg(PHI), TRI) << " = PHI("); + LLVM_DEBUG(dbgs() << "Shrink PHI: "); + LLVM_DEBUG(PHI.dump()); + LLVM_DEBUG(dbgs() << " to " << printReg(getPHIDestReg(PHI), TRI) + << " = PHI("); bool Replaced = false; unsigned NumInputs = getPHINumInputs(PHI); @@ -1480,8 +1486,8 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, if (SourceMBB) { MIB.addReg(CombinedSourceReg); MIB.addMBB(SourceMBB); - DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " - << printMBBReference(*SourceMBB)); + LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " + << printMBBReference(*SourceMBB)); } for (unsigned i = 0; i < NumInputs; ++i) { @@ -1492,10 +1498,10 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, MachineBasicBlock *SourcePred = getPHIPred(PHI, i); MIB.addReg(SourceReg); MIB.addMBB(SourcePred); - DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " - << printMBBReference(*SourcePred)); + LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " + << printMBBReference(*SourcePred)); } - DEBUG(dbgs() << ")\n"); + LLVM_DEBUG(dbgs() << ")\n"); } PHI.eraseFromParent(); return Replaced; @@ -1504,9 +1510,10 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, void AMDGPUMachineCFGStructurizer::replacePHI( MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *LastMerge, SmallVector<unsigned, 2> &PHIRegionIndices) { - DEBUG(dbgs() << "Replace PHI: "); - DEBUG(PHI.dump()); - DEBUG(dbgs() << " with " << printReg(getPHIDestReg(PHI), TRI) << " = PHI("); + LLVM_DEBUG(dbgs() << "Replace PHI: "); + LLVM_DEBUG(PHI.dump()); + LLVM_DEBUG(dbgs() << " with " << printReg(getPHIDestReg(PHI), TRI) + << " = PHI("); bool HasExternalEdge = false; unsigned NumInputs = getPHINumInputs(PHI); @@ -1523,8 +1530,8 @@ void AMDGPUMachineCFGStructurizer::replacePHI( getPHIDestReg(PHI)); MIB.addReg(CombinedSourceReg); MIB.addMBB(LastMerge); - DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " - << printMBBReference(*LastMerge)); + LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " + << printMBBReference(*LastMerge)); for (unsigned i = 0; i < NumInputs; ++i) { if (isPHIRegionIndex(PHIRegionIndices, i)) { continue; @@ -1533,10 +1540,10 @@ void AMDGPUMachineCFGStructurizer::replacePHI( MachineBasicBlock *SourcePred = getPHIPred(PHI, i); MIB.addReg(SourceReg); MIB.addMBB(SourcePred); - DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " - << printMBBReference(*SourcePred)); + LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " + << printMBBReference(*SourcePred)); } - DEBUG(dbgs() << ")\n"); + LLVM_DEBUG(dbgs() << ")\n"); } else { replaceRegisterWith(getPHIDestReg(PHI), CombinedSourceReg); } @@ -1546,9 +1553,9 @@ void AMDGPUMachineCFGStructurizer::replacePHI( void 
AMDGPUMachineCFGStructurizer::replaceEntryPHI( MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB, SmallVector<unsigned, 2> &PHIRegionIndices) { - DEBUG(dbgs() << "Replace entry PHI: "); - DEBUG(PHI.dump()); - DEBUG(dbgs() << " with "); + LLVM_DEBUG(dbgs() << "Replace entry PHI: "); + LLVM_DEBUG(PHI.dump()); + LLVM_DEBUG(dbgs() << " with "); unsigned NumInputs = getPHINumInputs(PHI); unsigned NumNonRegionInputs = NumInputs; @@ -1561,18 +1568,19 @@ void AMDGPUMachineCFGStructurizer::replaceEntryPHI( if (NumNonRegionInputs == 0) { auto DestReg = getPHIDestReg(PHI); replaceRegisterWith(DestReg, CombinedSourceReg); - DEBUG(dbgs() << " register " << printReg(CombinedSourceReg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << " register " << printReg(CombinedSourceReg, TRI) + << "\n"); PHI.eraseFromParent(); } else { - DEBUG(dbgs() << printReg(getPHIDestReg(PHI), TRI) << " = PHI("); + LLVM_DEBUG(dbgs() << printReg(getPHIDestReg(PHI), TRI) << " = PHI("); MachineBasicBlock *MBB = PHI.getParent(); MachineInstrBuilder MIB = BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), getPHIDestReg(PHI)); MIB.addReg(CombinedSourceReg); MIB.addMBB(IfMBB); - DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " - << printMBBReference(*IfMBB)); + LLVM_DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " + << printMBBReference(*IfMBB)); unsigned NumInputs = getPHINumInputs(PHI); for (unsigned i = 0; i < NumInputs; ++i) { if (isPHIRegionIndex(PHIRegionIndices, i)) { @@ -1582,10 +1590,10 @@ void AMDGPUMachineCFGStructurizer::replaceEntryPHI( MachineBasicBlock *SourcePred = getPHIPred(PHI, i); MIB.addReg(SourceReg); MIB.addMBB(SourcePred); - DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " - << printMBBReference(*SourcePred)); + LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " + << printMBBReference(*SourcePred)); } - DEBUG(dbgs() << ")\n"); + LLVM_DEBUG(dbgs() << ")\n"); PHI.eraseFromParent(); } } @@ -1607,8 +1615,9 @@ void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs( } } - DEBUG(dbgs() << "Register " << printReg(Reg, TRI) << " is " - << (IsDead ? "dead" : "alive") << " after PHI replace\n"); + LLVM_DEBUG(dbgs() << "Register " << printReg(Reg, TRI) << " is " + << (IsDead ? 
"dead" : "alive") + << " after PHI replace\n"); if (IsDead) { LRegion->removeLiveOut(Reg); } @@ -1682,8 +1691,8 @@ void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHIs(LinearizedRegion *Regi void AMDGPUMachineCFGStructurizer::insertUnconditionalBranch(MachineBasicBlock *MBB, MachineBasicBlock *Dest, const DebugLoc &DL) { - DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber() - << " -> " << Dest->getNumber() << "\n"); + LLVM_DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber() + << " -> " << Dest->getNumber() << "\n"); MachineBasicBlock::instr_iterator Terminator = MBB->getFirstInstrTerminator(); bool HasTerminator = Terminator != MBB->instr_end(); if (HasTerminator) { @@ -1732,7 +1741,8 @@ AMDGPUMachineCFGStructurizer::createLinearizedExitBlock(RegionMRT *Region) { MF->insert(ExitIter, LastMerge); LastMerge->addSuccessor(Exit); insertUnconditionalBranch(LastMerge, Exit); - DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() << "\n"); + LLVM_DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() + << "\n"); } return LastMerge; } @@ -1748,11 +1758,12 @@ void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB, if (MergeBB->succ_begin() == MergeBB->succ_end()) { return; } - DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB) - << "): " << printReg(DestRegister, TRI) << " = PHI(" - << printReg(IfSourceRegister, TRI) << ", " - << printMBBReference(*IfBB) << printReg(CodeSourceRegister, TRI) - << ", " << printMBBReference(*CodeBB) << ")\n"); + LLVM_DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB) + << "): " << printReg(DestRegister, TRI) << " = PHI(" + << printReg(IfSourceRegister, TRI) << ", " + << printMBBReference(*IfBB) + << printReg(CodeSourceRegister, TRI) << ", " + << printMBBReference(*CodeBB) << ")\n"); const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin()); MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL, TII->get(TargetOpcode::PHI), DestRegister); @@ -1810,8 +1821,8 @@ static void removeExternalCFGEdges(MachineBasicBlock *StartMBB, for (auto SI : Succs) { std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI; - DEBUG(dbgs() << "Removing edge: " << printMBBReference(*Edge.first) - << " -> " << printMBBReference(*Edge.second) << "\n"); + LLVM_DEBUG(dbgs() << "Removing edge: " << printMBBReference(*Edge.first) + << " -> " << printMBBReference(*Edge.second) << "\n"); Edge.first->removeSuccessor(Edge.second); } } @@ -1844,13 +1855,13 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock( IfBB->addSuccessor(MergeBB); IfBB->addSuccessor(CodeBBStart); - DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n"); + LLVM_DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n"); // Ensure that the MergeBB is a successor of the CodeEndBB. 
if (!CodeBBEnd->isSuccessor(MergeBB)) CodeBBEnd->addSuccessor(MergeBB); - DEBUG(dbgs() << "Moved " << printMBBReference(*CodeBBStart) << " through " - << printMBBReference(*CodeBBEnd) << "\n"); + LLVM_DEBUG(dbgs() << "Moved " << printMBBReference(*CodeBBStart) + << " through " << printMBBReference(*CodeBBEnd) << "\n"); // If we have a single predecessor we can find a reasonable debug location MachineBasicBlock *SinglePred = @@ -1935,16 +1946,18 @@ void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *Co MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) { if (MRI->def_begin(Reg) == MRI->def_end()) { - DEBUG(dbgs() << "Register " << printReg(Reg, MRI->getTargetRegisterInfo()) - << " has NO defs\n"); + LLVM_DEBUG(dbgs() << "Register " + << printReg(Reg, MRI->getTargetRegisterInfo()) + << " has NO defs\n"); } else if (!MRI->hasOneDef(Reg)) { - DEBUG(dbgs() << "Register " << printReg(Reg, MRI->getTargetRegisterInfo()) - << " has multiple defs\n"); - DEBUG(dbgs() << "DEFS BEGIN:\n"); + LLVM_DEBUG(dbgs() << "Register " + << printReg(Reg, MRI->getTargetRegisterInfo()) + << " has multiple defs\n"); + LLVM_DEBUG(dbgs() << "DEFS BEGIN:\n"); for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) { - DEBUG(DI->getParent()->dump()); + LLVM_DEBUG(DI->getParent()->dump()); } - DEBUG(dbgs() << "DEFS END\n"); + LLVM_DEBUG(dbgs() << "DEFS END\n"); } assert(MRI->hasOneDef(Reg) && "Register has multiple definitions"); @@ -1986,7 +1999,7 @@ void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB, const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg); unsigned NextDestReg = MRI->createVirtualRegister(RegClass); bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1; - DEBUG(dbgs() << "Insert Chained PHI\n"); + LLVM_DEBUG(dbgs() << "Insert Chained PHI\n"); insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg, SourceReg, IsLastDef); @@ -2022,16 +2035,16 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB, } for (auto LI : OldLiveOuts) { - DEBUG(dbgs() << "LiveOut: " << printReg(LI, TRI)); + LLVM_DEBUG(dbgs() << "LiveOut: " << printReg(LI, TRI)); if (!containsDef(CodeBB, InnerRegion, LI) || (!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) { // If the register simly lives through the CodeBB, we don't have // to rewrite anything since the register is not defined in this // part of the code. - DEBUG(dbgs() << "- through"); + LLVM_DEBUG(dbgs() << "- through"); continue; } - DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "\n"); unsigned Reg = LI; if (/*!PHIInfo.isSource(Reg) &&*/ Reg != InnerRegion->getBBSelectRegOut()) { // If the register is live out, we do want to create a phi, @@ -2048,12 +2061,12 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB, unsigned IfSourceReg = MRI->createVirtualRegister(RegClass); // Create initializer, this value is never used, but is needed // to satisfy SSA. 
- DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n"); + LLVM_DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n"); TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(), IfSourceReg, 0); InnerRegion->replaceRegisterOutsideRegion(Reg, PHIDestReg, true, MRI); - DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n"); + LLVM_DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n"); insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, PHIDestReg, IfSourceReg, Reg, true); } @@ -2063,22 +2076,22 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB, // is a source block for a definition. SmallVector<unsigned, 4> Sources; if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) { - DEBUG(dbgs() << "Inserting PHI Live Out from " << printMBBReference(*CodeBB) - << "\n"); + LLVM_DEBUG(dbgs() << "Inserting PHI Live Out from " + << printMBBReference(*CodeBB) << "\n"); for (auto SI : Sources) { unsigned DestReg; PHIInfo.findDest(SI, CodeBB, DestReg); insertChainedPHI(IfBB, CodeBB, MergeBB, InnerRegion, DestReg, SI); } - DEBUG(dbgs() << "Insertion done.\n"); + LLVM_DEBUG(dbgs() << "Insertion done.\n"); } - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(PHIInfo.dump(MRI)); } void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) { - DEBUG(dbgs() << "Before PHI Prune\n"); - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(dbgs() << "Before PHI Prune\n"); + LLVM_DEBUG(PHIInfo.dump(MRI)); SmallVector<std::tuple<unsigned, unsigned, MachineBasicBlock *>, 4> ElimiatedSources; for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; @@ -2118,8 +2131,8 @@ void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) { PHIInfo.removeSource(std::get<0>(SourceInfo), std::get<1>(SourceInfo), std::get<2>(SourceInfo)); } - DEBUG(dbgs() << "After PHI Prune\n"); - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(dbgs() << "After PHI Prune\n"); + LLVM_DEBUG(PHIInfo.dump(MRI)); } void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegion, @@ -2127,8 +2140,8 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio MachineBasicBlock *Entry = CurrentRegion->getEntry(); MachineBasicBlock *Exit = CurrentRegion->getExit(); - DEBUG(dbgs() << "RegionExit: " << Exit->getNumber() - << " Pred: " << (*(Entry->pred_begin()))->getNumber() << "\n"); + LLVM_DEBUG(dbgs() << "RegionExit: " << Exit->getNumber() << " Pred: " + << (*(Entry->pred_begin()))->getNumber() << "\n"); int NumSources = 0; auto SE = PHIInfo.sources_end(DestReg); @@ -2145,7 +2158,7 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio const DebugLoc &DL = Entry->findDebugLoc(Entry->begin()); MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL, TII->get(TargetOpcode::PHI), DestReg); - DEBUG(dbgs() << "Entry PHI " << printReg(DestReg, TRI) << " = PHI("); + LLVM_DEBUG(dbgs() << "Entry PHI " << printReg(DestReg, TRI) << " = PHI("); unsigned CurrentBackedgeReg = 0; @@ -2169,19 +2182,19 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio BackedgePHI.addReg(getPHISourceReg(*PHIDefInstr, 1)); BackedgePHI.addMBB((*SRI).second); CurrentBackedgeReg = NewBackedgeReg; - DEBUG(dbgs() << "Inserting backedge PHI: " - << printReg(NewBackedgeReg, TRI) << " = PHI(" - << printReg(CurrentBackedgeReg, TRI) << ", " - << printMBBReference(*getPHIPred(*PHIDefInstr, 0)) - << ", " - << printReg(getPHISourceReg(*PHIDefInstr, 1), TRI) - << ", " << printMBBReference(*(*SRI).second)); + 
LLVM_DEBUG(dbgs() + << "Inserting backedge PHI: " + << printReg(NewBackedgeReg, TRI) << " = PHI(" + << printReg(CurrentBackedgeReg, TRI) << ", " + << printMBBReference(*getPHIPred(*PHIDefInstr, 0)) << ", " + << printReg(getPHISourceReg(*PHIDefInstr, 1), TRI) << ", " + << printMBBReference(*(*SRI).second)); } } else { MIB.addReg(SourceReg); MIB.addMBB((*SRI).second); - DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " - << printMBBReference(*(*SRI).second) << ", "); + LLVM_DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " + << printMBBReference(*(*SRI).second) << ", "); } } @@ -2189,16 +2202,16 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio if (CurrentBackedgeReg != 0) { MIB.addReg(CurrentBackedgeReg); MIB.addMBB(Exit); - DEBUG(dbgs() << printReg(CurrentBackedgeReg, TRI) << ", " - << printMBBReference(*Exit) << ")\n"); + LLVM_DEBUG(dbgs() << printReg(CurrentBackedgeReg, TRI) << ", " + << printMBBReference(*Exit) << ")\n"); } else { - DEBUG(dbgs() << ")\n"); + LLVM_DEBUG(dbgs() << ")\n"); } } } void AMDGPUMachineCFGStructurizer::createEntryPHIs(LinearizedRegion *CurrentRegion) { - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(PHIInfo.dump(MRI)); for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; ++DRI) { @@ -2219,19 +2232,19 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register, MachineOperand &O = *I; ++I; if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { - DEBUG(dbgs() << "Trying to substitute physical register: " - << printReg(NewRegister, MRI->getTargetRegisterInfo()) - << "\n"); + LLVM_DEBUG(dbgs() << "Trying to substitute physical register: " + << printReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); llvm_unreachable("Cannot substitute physical registers"); // We don't handle physical registers, but if we need to // in the future This is how we do it: // O.substPhysReg(NewRegister, *TRI); } else { - DEBUG(dbgs() << "Replacing register: " - << printReg(Register, MRI->getTargetRegisterInfo()) - << " with " - << printReg(NewRegister, MRI->getTargetRegisterInfo()) - << "\n"); + LLVM_DEBUG(dbgs() << "Replacing register: " + << printReg(Register, MRI->getTargetRegisterInfo()) + << " with " + << printReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); O.setReg(NewRegister); } } @@ -2239,20 +2252,20 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register, getRegionMRT()->replaceLiveOutReg(Register, NewRegister); - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(PHIInfo.dump(MRI)); } void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEntry) { - DEBUG(dbgs() << "Resolve PHI Infos\n"); - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(dbgs() << "Resolve PHI Infos\n"); + LLVM_DEBUG(PHIInfo.dump(MRI)); for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; ++DRI) { unsigned DestReg = *DRI; - DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) << "\n"); auto SRI = PHIInfo.sources_begin(DestReg); unsigned SourceReg = (*SRI).first; - DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) - << " SourceReg: " << printReg(SourceReg, TRI) << "\n"); + LLVM_DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) + << " SourceReg: " << printReg(SourceReg, TRI) << "\n"); assert(PHIInfo.sources_end(DestReg) == ++SRI && "More than one phi source in entry node"); @@ -2326,9 +2339,9 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( MachineOperand RegOp = 
MachineOperand::CreateReg(Reg, false, false, true); ArrayRef<MachineOperand> Cond(RegOp); - DEBUG(dbgs() << "RegionExitReg: "); - DEBUG(Cond[0].print(dbgs(), TRI)); - DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "RegionExitReg: "); + LLVM_DEBUG(Cond[0].print(dbgs(), TRI)); + LLVM_DEBUG(dbgs() << "\n"); TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit, Cond, DebugLoc()); RegionExit->addSuccessor(CurrentRegion->getEntry()); @@ -2338,12 +2351,12 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo); InnerRegion.setParent(CurrentRegion); - DEBUG(dbgs() << "Insert BB Select PHI (BB)\n"); + LLVM_DEBUG(dbgs() << "Insert BB Select PHI (BB)\n"); insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn, CodeBBSelectReg); InnerRegion.addMBB(MergeBB); - DEBUG(InnerRegion.print(dbgs(), TRI)); + LLVM_DEBUG(InnerRegion.print(dbgs(), TRI)); rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion); extractKilledPHIs(CodeBB); if (IsRegionEntryBB) { @@ -2384,16 +2397,16 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( CurrentRegion->getRegionMRT()->getEntry()->getNumber()); MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true); ArrayRef<MachineOperand> Cond(RegOp); - DEBUG(dbgs() << "RegionExitReg: "); - DEBUG(Cond[0].print(dbgs(), TRI)); - DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "RegionExitReg: "); + LLVM_DEBUG(Cond[0].print(dbgs(), TRI)); + LLVM_DEBUG(dbgs() << "\n"); TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit, Cond, DebugLoc()); RegionExit->addSuccessor(IfBB); } } CurrentRegion->addMBBs(InnerRegion); - DEBUG(dbgs() << "Insert BB Select PHI (region)\n"); + LLVM_DEBUG(dbgs() << "Insert BB Select PHI (region)\n"); insertMergePHI(IfBB, CodeExitBB, MergeBB, BBSelectRegOut, BBSelectRegIn, CodeBBSelectReg); @@ -2439,15 +2452,16 @@ void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI, MachineInstrBuilder MIB = BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), NewDestReg); - DEBUG(dbgs() << "Split Entry PHI " << printReg(NewDestReg, TRI) << " = PHI("); + LLVM_DEBUG(dbgs() << "Split Entry PHI " << printReg(NewDestReg, TRI) + << " = PHI("); MIB.addReg(PHISource); MIB.addMBB(Entry); - DEBUG(dbgs() << printReg(PHISource, TRI) << ", " - << printMBBReference(*Entry)); + LLVM_DEBUG(dbgs() << printReg(PHISource, TRI) << ", " + << printMBBReference(*Entry)); MIB.addReg(RegionSourceReg); MIB.addMBB(RegionSourceMBB); - DEBUG(dbgs() << " ," << printReg(RegionSourceReg, TRI) << ", " - << printMBBReference(*RegionSourceMBB) << ")\n"); + LLVM_DEBUG(dbgs() << " ," << printReg(RegionSourceReg, TRI) << ", " + << printMBBReference(*RegionSourceMBB) << ")\n"); } void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry, @@ -2480,7 +2494,8 @@ AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) { LRegion->addMBB(NewExit); LRegion->setExit(NewExit); - DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber() << "\n"); + LLVM_DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber() + << "\n"); // Replace any PHI Predecessors in the successor with NewExit for (auto &II : *Succ) { @@ -2528,9 +2543,9 @@ AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) { MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI()); MachineBasicBlock *Exit = LRegion->getExit(); - DEBUG(dbgs() << "Split " << printMBBReference(*Entry) << " to " - << 
printMBBReference(*Entry) << " -> " - << printMBBReference(*EntrySucc) << "\n"); + LLVM_DEBUG(dbgs() << "Split " << printMBBReference(*Entry) << " to " + << printMBBReference(*Entry) << " -> " + << printMBBReference(*EntrySucc) << "\n"); LRegion->addMBB(EntrySucc); // Make the backedge go to Entry Succ @@ -2621,21 +2636,21 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { rewriteRegionExitPHIs(Region, LastMerge, LRegion); removeOldExitPreds(Region); - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(PHIInfo.dump(MRI)); SetVector<MRT *> *Children = Region->getChildren(); - DEBUG(dbgs() << "===========If Region Start===============\n"); + LLVM_DEBUG(dbgs() << "===========If Region Start===============\n"); if (LRegion->getHasLoop()) { - DEBUG(dbgs() << "Has Backedge: Yes\n"); + LLVM_DEBUG(dbgs() << "Has Backedge: Yes\n"); } else { - DEBUG(dbgs() << "Has Backedge: No\n"); + LLVM_DEBUG(dbgs() << "Has Backedge: No\n"); } unsigned BBSelectRegIn; unsigned BBSelectRegOut; for (auto CI = Children->begin(), CE = Children->end(); CI != CE; ++CI) { - DEBUG(dbgs() << "CurrentRegion: \n"); - DEBUG(LRegion->print(dbgs(), TRI)); + LLVM_DEBUG(dbgs() << "CurrentRegion: \n"); + LLVM_DEBUG(LRegion->print(dbgs(), TRI)); auto CNI = CI; ++CNI; @@ -2649,9 +2664,9 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { // We found the block is the exit of an inner region, we need // to put it in the current linearized region. - DEBUG(dbgs() << "Linearizing region: "); - DEBUG(InnerLRegion->print(dbgs(), TRI)); - DEBUG(dbgs() << "\n"); + LLVM_DEBUG(dbgs() << "Linearizing region: "); + LLVM_DEBUG(InnerLRegion->print(dbgs(), TRI)); + LLVM_DEBUG(dbgs() << "\n"); MachineBasicBlock *InnerEntry = InnerLRegion->getEntry(); if ((&(*(InnerEntry->getParent()->begin()))) == InnerEntry) { @@ -2669,10 +2684,10 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { BBSelectRegOut = Child->getBBSelectRegOut(); BBSelectRegIn = Child->getBBSelectRegIn(); - DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI) - << "\n"); - DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI) - << "\n"); + LLVM_DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI) + << "\n"); + LLVM_DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI) + << "\n"); MachineBasicBlock *IfEnd = CurrentMerge; CurrentMerge = createIfRegion(CurrentMerge, InnerLRegion, LRegion, @@ -2681,7 +2696,7 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { TII->convertNonUniformIfRegion(CurrentMerge, IfEnd); } else { MachineBasicBlock *MBB = Child->getMBBMRT()->getMBB(); - DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n"); + LLVM_DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n"); if (MBB == getSingleExitNode(*(MBB->getParent()))) { // If this is the exit block then we need to skip to the next. 
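The AMDGPUMachineCFGStructurizer hunks above are dominated by a mechanical rename of the old DEBUG(...) logging macro to LLVM_DEBUG(...). As a rough illustration of how that macro is normally used (this sketch is not part of the patch, and the debug type below is invented): the statement is compiled in only for asserts-enabled builds and is further gated at run time by -debug or -debug-only=<DEBUG_TYPE>.

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical debug type, chosen only for this illustration.
#define DEBUG_TYPE "my-example-pass"

static void logVirtReg(unsigned Reg) {
  // Compiled out in NDEBUG builds; in asserts builds it prints only when
  // the tool is run with -debug or -debug-only=my-example-pass.
  LLVM_DEBUG(dbgs() << "processing vreg " << Reg << '\n');
}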
@@ -2693,10 +2708,10 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { BBSelectRegOut = Child->getBBSelectRegOut(); BBSelectRegIn = Child->getBBSelectRegIn(); - DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI) - << "\n"); - DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI) - << "\n"); + LLVM_DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI) + << "\n"); + LLVM_DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI) + << "\n"); MachineBasicBlock *IfEnd = CurrentMerge; // This is a basic block that is not part of an inner region, we @@ -2707,7 +2722,7 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { TII->convertNonUniformIfRegion(CurrentMerge, IfEnd); } - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(PHIInfo.dump(MRI)); } } @@ -2728,7 +2743,7 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { NewInReg, Region->getEntry()->getNumber()); // Need to be careful about updating the registers inside the region. LRegion->replaceRegisterInsideRegion(InReg, InnerSelectReg, false, MRI); - DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n"); + LLVM_DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n"); insertMergePHI(LRegion->getEntry(), LRegion->getExit(), NewSucc, InnerSelectReg, NewInReg, LRegion->getRegionMRT()->getInnerOutputRegister()); @@ -2740,11 +2755,11 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { TII->insertReturn(*LastMerge); } - DEBUG(Region->getEntry()->getParent()->dump()); - DEBUG(LRegion->print(dbgs(), TRI)); - DEBUG(PHIInfo.dump(MRI)); + LLVM_DEBUG(Region->getEntry()->getParent()->dump()); + LLVM_DEBUG(LRegion->print(dbgs(), TRI)); + LLVM_DEBUG(PHIInfo.dump(MRI)); - DEBUG(dbgs() << "===========If Region End===============\n"); + LLVM_DEBUG(dbgs() << "===========If Region End===============\n"); Region->setLinearizedRegion(LRegion); return true; @@ -2784,12 +2799,12 @@ bool AMDGPUMachineCFGStructurizer::structurizeRegions(RegionMRT *Region, } void AMDGPUMachineCFGStructurizer::initFallthroughMap(MachineFunction &MF) { - DEBUG(dbgs() << "Fallthrough Map:\n"); + LLVM_DEBUG(dbgs() << "Fallthrough Map:\n"); for (auto &MBBI : MF) { MachineBasicBlock *MBB = MBBI.getFallThrough(); if (MBB != nullptr) { - DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> " - << MBB->getNumber() << "\n"); + LLVM_DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> " + << MBB->getNumber() << "\n"); } FallthroughMap[&MBBI] = MBB; } @@ -2800,8 +2815,8 @@ void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region, LinearizedRegion *LRegion = new LinearizedRegion(); if (SelectOut) { LRegion->addLiveOut(SelectOut); - DEBUG(dbgs() << "Add LiveOut (BBSelect): " << printReg(SelectOut, TRI) - << "\n"); + LLVM_DEBUG(dbgs() << "Add LiveOut (BBSelect): " << printReg(SelectOut, TRI) + << "\n"); } LRegion->setRegionMRT(Region); Region->setLinearizedRegion(LRegion); @@ -2856,26 +2871,26 @@ static void checkRegOnlyPHIInputs(MachineFunction &MF) { } bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); TRI = ST.getRegisterInfo(); MRI = &(MF.getRegInfo()); initFallthroughMap(MF); checkRegOnlyPHIInputs(MF); - DEBUG(dbgs() << "----STRUCTURIZER START----\n"); - DEBUG(MF.dump()); + LLVM_DEBUG(dbgs() << "----STRUCTURIZER START----\n"); + 
LLVM_DEBUG(MF.dump()); Regions = &(getAnalysis<MachineRegionInfoPass>().getRegionInfo()); - DEBUG(Regions->dump()); + LLVM_DEBUG(Regions->dump()); RegionMRT *RTree = MRT::buildMRT(MF, Regions, TII, MRI); setRegionMRT(RTree); initializeSelectRegisters(RTree, 0, MRI, TII); - DEBUG(RTree->dump(TRI)); + LLVM_DEBUG(RTree->dump(TRI)); bool result = structurizeRegions(RTree, true); delete RTree; - DEBUG(dbgs() << "----STRUCTURIZER END----\n"); + LLVM_DEBUG(dbgs() << "----STRUCTURIZER END----\n"); initFallthroughMap(MF); return result; } diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index b7c8c1213537..13b4b50149ce 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -9,20 +9,38 @@ #include "AMDGPUMachineFunction.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUPerfHintAnalysis.h" +#include "llvm/CodeGen/MachineModuleInfo.h" using namespace llvm; AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), LocalMemoryObjects(), - KernArgSize(0), + ExplicitKernArgSize(0), MaxKernArgAlign(0), LDSSize(0), - ABIArgOffset(0), IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())), - NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) { + NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath), + MemoryBound(false), + WaveLimiter(false) { + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); + // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset, // except reserved size is not correctly aligned. + const Function &F = MF.getFunction(); + + if (auto *Resolver = MF.getMMI().getResolver()) { + if (AMDGPUPerfHintAnalysis *PHA = static_cast<AMDGPUPerfHintAnalysis*>( + Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) { + MemoryBound = PHA->isMemoryBound(&F); + WaveLimiter = PHA->needsWaveLimiter(&F); + } + } + + CallingConv::ID CC = F.getCallingConv(); + if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) + ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign); } unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 99bb61b21db0..8d6b871bc03e 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -15,57 +15,43 @@ namespace llvm { +class GCNSubtarget; + class AMDGPUMachineFunction : public MachineFunctionInfo { /// A map to keep track of local memory objects and their offsets within the /// local memory space. SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects; - uint64_t KernArgSize; - unsigned MaxKernArgAlign; +protected: + uint64_t ExplicitKernArgSize; // Cache for this. + unsigned MaxKernArgAlign; // Cache for this. /// Number of bytes in the LDS that are being used. unsigned LDSSize; - // FIXME: This should probably be removed. - /// Start of implicit kernel args - unsigned ABIArgOffset; - - // Kernels + shaders. i.e. functions called by the driver and not not called + // Kernels + shaders. i.e. functions called by the driver and not called // by other functions. bool IsEntryFunction; bool NoSignedZerosFPMath; -public: - AMDGPUMachineFunction(const MachineFunction &MF); - - uint64_t allocateKernArg(uint64_t Size, unsigned Align) { - assert(isPowerOf2_32(Align)); - KernArgSize = alignTo(KernArgSize, Align); + // Function may be memory bound. 
+ bool MemoryBound; - uint64_t Result = KernArgSize; - KernArgSize += Size; + // Kernel may need limited waves per EU for better performance. + bool WaveLimiter; - MaxKernArgAlign = std::max(Align, MaxKernArgAlign); - return Result; - } +public: + AMDGPUMachineFunction(const MachineFunction &MF); - uint64_t getKernArgSize() const { - return KernArgSize; + uint64_t getExplicitKernArgSize() const { + return ExplicitKernArgSize; } unsigned getMaxKernArgAlign() const { return MaxKernArgAlign; } - void setABIArgOffset(unsigned NewOffset) { - ABIArgOffset = NewOffset; - } - - unsigned getABIArgOffset() const { - return ABIArgOffset; - } - unsigned getLDSSize() const { return LDSSize; } @@ -78,6 +64,14 @@ public: return NoSignedZerosFPMath; } + bool isMemoryBound() const { + return MemoryBound; + } + + bool needsWaveLimiter() const { + return WaveLimiter; + } + unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV); }; diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp index 3164140abe29..7b9f673c418c 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU Machine Module Info. +/// AMDGPU Machine Module Info. /// // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h index 1a728c6bd04a..1219ab26fb69 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h +++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU Machine Module Info. +/// AMDGPU Machine Module Info. /// // //===----------------------------------------------------------------------===// @@ -30,14 +30,14 @@ private: // All supported memory/synchronization scopes can be found here: // http://llvm.org/docs/AMDGPUUsage.html#memory-scopes - /// \brief Agent synchronization scope ID. + /// Agent synchronization scope ID. SyncScope::ID AgentSSID; - /// \brief Workgroup synchronization scope ID. + /// Workgroup synchronization scope ID. SyncScope::ID WorkgroupSSID; - /// \brief Wavefront synchronization scope ID. + /// Wavefront synchronization scope ID. SyncScope::ID WavefrontSSID; - /// \brief In AMDGPU target synchronization scopes are inclusive, meaning a + /// In AMDGPU target synchronization scopes are inclusive, meaning a /// larger synchronization scope is inclusive of a smaller synchronization /// scope. /// @@ -74,7 +74,7 @@ public: return WavefrontSSID; } - /// \brief In AMDGPU target synchronization scopes are inclusive, meaning a + /// In AMDGPU target synchronization scopes are inclusive, meaning a /// larger synchronization scope is inclusive of a smaller synchronization /// scope. 
/// diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp index 7263ba73d155..995d9ae3907f 100644 --- a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp +++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp @@ -15,6 +15,7 @@ #include "AMDGPUMacroFusion.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MacroFusion.h" @@ -22,7 +23,7 @@ using namespace llvm; namespace { -/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused +/// Check if the instr pair, FirstMI and SecondMI, should be fused /// together. Given SecondMI, when FirstMI is unspecified, then check if /// SecondMI may be part of a fused pair at all. static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_, diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp index bb65636f15af..7bd8533a0ccf 100644 --- a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // \file -// \brief This post-linking pass replaces the function pointer of enqueued +// This post-linking pass replaces the function pointer of enqueued // block kernel with a global variable (runtime handle) and adds // "runtime-handle" attribute to the enqueued block kernel. // @@ -36,7 +36,9 @@ #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/IR/User.h" #include "llvm/Pass.h" @@ -49,7 +51,7 @@ using namespace llvm; namespace { -/// \brief Lower enqueued blocks. +/// Lower enqueued blocks. class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass { public: static char ID; @@ -80,49 +82,63 @@ static void collectCallers(Function *F, DenseSet<Function *> &Callers) { for (auto U : F->users()) { if (auto *CI = dyn_cast<CallInst>(&*U)) { auto *Caller = CI->getParent()->getParent(); - if (Callers.count(Caller)) - continue; - Callers.insert(Caller); - collectCallers(Caller, Callers); + if (Callers.insert(Caller).second) + collectCallers(Caller, Callers); } } } +/// If \p U is instruction or constant, collect functions which directly or +/// indirectly use it. 
+static void collectFunctionUsers(User *U, DenseSet<Function *> &Funcs) { + if (auto *I = dyn_cast<Instruction>(U)) { + auto *F = I->getParent()->getParent(); + if (Funcs.insert(F).second) + collectCallers(F, Funcs); + return; + } + if (!isa<Constant>(U)) + return; + for (auto UU : U->users()) + collectFunctionUsers(&*UU, Funcs); +} + bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { DenseSet<Function *> Callers; auto &C = M.getContext(); bool Changed = false; for (auto &F : M.functions()) { if (F.hasFnAttribute("enqueued-block")) { - if (!F.hasOneUse() || !F.user_begin()->hasOneUse() || - !isa<ConstantExpr>(*F.user_begin()) || - !isa<ConstantExpr>(*F.user_begin()->user_begin())) { - continue; + if (!F.hasName()) { + SmallString<64> Name; + Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel", + M.getDataLayout()); + F.setName(Name); } - auto *BitCast = cast<ConstantExpr>(*F.user_begin()); - auto *AddrCast = cast<ConstantExpr>(*BitCast->user_begin()); - auto RuntimeHandle = (F.getName() + "_runtime_handle").str(); + LLVM_DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n'); + auto RuntimeHandle = (F.getName() + ".runtime_handle").str(); + auto T = ArrayType::get(Type::getInt64Ty(C), 2); auto *GV = new GlobalVariable( - M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS), - /*IsConstant=*/true, GlobalValue::ExternalLinkage, - /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr, - GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, - /*IsExternallyInitialized=*/true); - DEBUG(dbgs() << "runtime handle created: " << *GV << '\n'); - auto *NewPtr = ConstantExpr::getPointerCast(GV, AddrCast->getType()); - AddrCast->replaceAllUsesWith(NewPtr); - F.addFnAttr("runtime-handle", RuntimeHandle); - F.setLinkage(GlobalValue::ExternalLinkage); - - // Collect direct or indirect callers of enqueue_kernel. - for (auto U : NewPtr->users()) { - if (auto *I = dyn_cast<Instruction>(&*U)) { - auto *F = I->getParent()->getParent(); - Callers.insert(F); - collectCallers(F, Callers); - } + M, T, + /*IsConstant=*/false, GlobalValue::ExternalLinkage, + /*Initializer=*/Constant::getNullValue(T), RuntimeHandle, + /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, + AMDGPUAS::GLOBAL_ADDRESS, + /*IsExternallyInitialized=*/false); + LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n'); + + for (auto U : F.users()) { + auto *UU = &*U; + if (!isa<ConstantExpr>(UU)) + continue; + collectFunctionUsers(UU, Callers); + auto *BitCast = cast<ConstantExpr>(UU); + auto *NewPtr = ConstantExpr::getPointerCast(GV, BitCast->getType()); + BitCast->replaceAllUsesWith(NewPtr); + F.addFnAttr("runtime-handle", RuntimeHandle); + F.setLinkage(GlobalValue::ExternalLinkage); + Changed = true; } - Changed = true; } } @@ -130,6 +146,7 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL) continue; F->addFnAttr("calls-enqueue-kernel"); + LLVM_DEBUG(dbgs() << "mark enqueue_kernel caller:" << F->getName() << '\n'); } return Changed; } diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp new file mode 100644 index 000000000000..3cfdccc9fe51 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -0,0 +1,397 @@ +//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Analyzes if a function potentially memory bound and if a kernel +/// kernel may benefit from limiting number of waves to reduce cache thrashing. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUPerfHintAnalysis.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-perf-hint" + +static cl::opt<unsigned> + MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden, + cl::desc("Function mem bound threshold in %")); + +static cl::opt<unsigned> + LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden, + cl::desc("Kernel limit wave threshold in %")); + +static cl::opt<unsigned> + IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden, + cl::desc("Indirect access memory instruction weight")); + +static cl::opt<unsigned> + LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden, + cl::desc("Large stride memory access weight")); + +static cl::opt<unsigned> + LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden, + cl::desc("Large stride memory access threshold")); + +STATISTIC(NumMemBound, "Number of functions marked as memory bound"); +STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave"); + +char llvm::AMDGPUPerfHintAnalysis::ID = 0; +char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID; + +INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE, + "Analysis if a function is memory bound", true, true) + +namespace { + +struct AMDGPUPerfHint { + friend AMDGPUPerfHintAnalysis; + +public: + AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_, + const TargetLowering *TLI_) + : FIM(FIM_), DL(nullptr), TLI(TLI_) {} + + void runOnFunction(Function &F); + +private: + struct MemAccessInfo { + const Value *V; + const Value *Base; + int64_t Offset; + MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {} + bool isLargeStride(MemAccessInfo &Reference) const; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + Printable print() const { + return Printable([this](raw_ostream &OS) { + OS << "Value: " << *V << '\n' + << "Base: " << *Base << " Offset: " << Offset << '\n'; + }); + } +#endif + }; + + MemAccessInfo makeMemAccessInfo(Instruction *) const; + + MemAccessInfo LastAccess; // Last memory access info + + AMDGPUPerfHintAnalysis::FuncInfoMap &FIM; + + const DataLayout *DL; + + AMDGPUAS AS; + + const TargetLowering *TLI; + + void visit(const Function &F); + static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F); + static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F); + + bool isIndirectAccess(const Instruction *Inst) const; + + /// Check if the instruction is large stride. 
+ /// The purpose is to identify memory access pattern like: + /// x = a[i]; + /// y = a[i+1000]; + /// z = a[i+2000]; + /// In the above example, the second and third memory access will be marked + /// large stride memory access. + bool isLargeStride(const Instruction *Inst); + + bool isGlobalAddr(const Value *V) const; + bool isLocalAddr(const Value *V) const; + bool isConstantAddr(const Value *V) const; +}; + +static const Value *getMemoryInstrPtr(const Instruction *Inst) { + if (auto LI = dyn_cast<LoadInst>(Inst)) { + return LI->getPointerOperand(); + } + if (auto SI = dyn_cast<StoreInst>(Inst)) { + return SI->getPointerOperand(); + } + if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) { + return AI->getPointerOperand(); + } + if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) { + return AI->getPointerOperand(); + } + if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) { + return MI->getRawDest(); + } + + return nullptr; +} + +bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { + LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n'); + SmallSet<const Value *, 32> WorkSet; + SmallSet<const Value *, 32> Visited; + if (const Value *MO = getMemoryInstrPtr(Inst)) { + if (isGlobalAddr(MO)) + WorkSet.insert(MO); + } + + while (!WorkSet.empty()) { + const Value *V = *WorkSet.begin(); + WorkSet.erase(*WorkSet.begin()); + if (!Visited.insert(V).second) + continue; + LLVM_DEBUG(dbgs() << " check: " << *V << '\n'); + + if (auto LD = dyn_cast<LoadInst>(V)) { + auto M = LD->getPointerOperand(); + if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) { + LLVM_DEBUG(dbgs() << " is IA\n"); + return true; + } + continue; + } + + if (auto GEP = dyn_cast<GetElementPtrInst>(V)) { + auto P = GEP->getPointerOperand(); + WorkSet.insert(P); + for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I) + WorkSet.insert(GEP->getOperand(I)); + continue; + } + + if (auto U = dyn_cast<UnaryInstruction>(V)) { + WorkSet.insert(U->getOperand(0)); + continue; + } + + if (auto BO = dyn_cast<BinaryOperator>(V)) { + WorkSet.insert(BO->getOperand(0)); + WorkSet.insert(BO->getOperand(1)); + continue; + } + + if (auto S = dyn_cast<SelectInst>(V)) { + WorkSet.insert(S->getFalseValue()); + WorkSet.insert(S->getTrueValue()); + continue; + } + + if (auto E = dyn_cast<ExtractElementInst>(V)) { + WorkSet.insert(E->getVectorOperand()); + continue; + } + + LLVM_DEBUG(dbgs() << " dropped\n"); + } + + LLVM_DEBUG(dbgs() << " is not IA\n"); + return false; +} + +void AMDGPUPerfHint::visit(const Function &F) { + auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo())); + if (!FIP.second) + return; + + AMDGPUPerfHintAnalysis::FuncInfo &FI = FIP.first->second; + + LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n'); + + for (auto &B : F) { + LastAccess = MemAccessInfo(); + for (auto &I : B) { + if (getMemoryInstrPtr(&I)) { + if (isIndirectAccess(&I)) + ++FI.IAMInstCount; + if (isLargeStride(&I)) + ++FI.LSMInstCount; + ++FI.MemInstCount; + ++FI.InstCount; + continue; + } + CallSite CS(const_cast<Instruction *>(&I)); + if (CS) { + Function *Callee = CS.getCalledFunction(); + if (!Callee || Callee->isDeclaration()) { + ++FI.InstCount; + continue; + } + if (&F == Callee) // Handle immediate recursion + continue; + + visit(*Callee); + auto Loc = FIM.find(Callee); + + assert(Loc != FIM.end() && "No func info"); + FI.MemInstCount += Loc->second.MemInstCount; + FI.InstCount += Loc->second.InstCount; + FI.IAMInstCount += Loc->second.IAMInstCount; + FI.LSMInstCount += 
Loc->second.LSMInstCount; + } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { + TargetLoweringBase::AddrMode AM; + auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL); + AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr)); + AM.HasBaseReg = !AM.BaseGV; + if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(), + GEP->getPointerAddressSpace())) + // Offset will likely be folded into load or store + continue; + ++FI.InstCount; + } else { + ++FI.InstCount; + } + } + } +} + +void AMDGPUPerfHint::runOnFunction(Function &F) { + if (FIM.find(&F) != FIM.end()) + return; + + const Module &M = *F.getParent(); + DL = &M.getDataLayout(); + AS = AMDGPU::getAMDGPUAS(M); + + visit(F); + auto Loc = FIM.find(&F); + + assert(Loc != FIM.end() && "No func info"); + LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Loc->second.MemInstCount + << '\n' + << " IAMInst: " << Loc->second.IAMInstCount << '\n' + << " LSMInst: " << Loc->second.LSMInstCount << '\n' + << " TotalInst: " << Loc->second.InstCount << '\n'); + + auto &FI = Loc->second; + + if (isMemBound(FI)) { + LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n"); + NumMemBound++; + } + + if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(FI)) { + LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n"); + NumLimitWave++; + } +} + +bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { + return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh; +} + +bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { + return ((FI.MemInstCount + FI.IAMInstCount * IAWeight + + FI.LSMInstCount * LSWeight) * + 100 / FI.InstCount) > LimitWaveThresh; +} + +bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const { + if (auto PT = dyn_cast<PointerType>(V->getType())) { + unsigned As = PT->getAddressSpace(); + // Flat likely points to global too. + return As == AS.GLOBAL_ADDRESS || As == AS.FLAT_ADDRESS; + } + return false; +} + +bool AMDGPUPerfHint::isLocalAddr(const Value *V) const { + if (auto PT = dyn_cast<PointerType>(V->getType())) + return PT->getAddressSpace() == AS.LOCAL_ADDRESS; + return false; +} + +bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) { + LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n'); + + MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst)); + bool IsLargeStride = MAI.isLargeStride(LastAccess); + if (MAI.Base) + LastAccess = std::move(MAI); + + return IsLargeStride; +} + +AMDGPUPerfHint::MemAccessInfo +AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const { + MemAccessInfo MAI; + const Value *MO = getMemoryInstrPtr(Inst); + + LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n'); + // Do not treat local-addr memory access as large stride. + if (isLocalAddr(MO)) + return MAI; + + MAI.V = MO; + MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL); + return MAI; +} + +bool AMDGPUPerfHint::isConstantAddr(const Value *V) const { + if (auto PT = dyn_cast<PointerType>(V->getType())) { + unsigned As = PT->getAddressSpace(); + return As == AS.CONSTANT_ADDRESS || As == AS.CONSTANT_ADDRESS_32BIT; + } + return false; +} + +bool AMDGPUPerfHint::MemAccessInfo::isLargeStride( + MemAccessInfo &Reference) const { + + if (!Base || !Reference.Base || Base != Reference.Base) + return false; + + uint64_t Diff = Offset > Reference.Offset ? 
Offset - Reference.Offset + : Reference.Offset - Offset; + bool Result = Diff > LargeStrideThresh; + LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n" + << print() << "<=>\n" + << Reference.print() << "Result:" << Result << '\n'); + return Result; +} +} // namespace + +bool AMDGPUPerfHintAnalysis::runOnFunction(Function &F) { + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + const TargetMachine &TM = TPC->getTM<TargetMachine>(); + const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F); + + AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering()); + Analyzer.runOnFunction(F); + return false; +} + +bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const { + auto FI = FIM.find(F); + if (FI == FIM.end()) + return false; + + return AMDGPUPerfHint::isMemBound(FI->second); +} + +bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const { + auto FI = FIM.find(F); + if (FI == FIM.end()) + return false; + + return AMDGPUPerfHint::needLimitWave(FI->second); +} diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h new file mode 100644 index 000000000000..be7f37cb6815 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h @@ -0,0 +1,55 @@ +//===- AMDGPUPerfHintAnalysis.h - analysis of functions memory traffic ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Analyzes if a function potentially memory bound and if a kernel +/// kernel may benefit from limiting number of waves to reduce cache thrashing. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H +#define LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H +#include "llvm/IR/ValueMap.h" +#include "llvm/Pass.h" + +namespace llvm { + +struct AMDGPUPerfHintAnalysis : public FunctionPass { + static char ID; + +public: + AMDGPUPerfHintAnalysis() : FunctionPass(ID) {} + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + bool isMemoryBound(const Function *F) const; + + bool needsWaveLimiter(const Function *F) const; + + struct FuncInfo { + unsigned MemInstCount; + unsigned InstCount; + unsigned IAMInstCount; // Indirect access memory instruction count + unsigned LSMInstCount; // Large stride memory instruction count + FuncInfo() : MemInstCount(0), InstCount(0), IAMInstCount(0), + LSMInstCount(0) {} + }; + + typedef ValueMap<const Function*, FuncInfo> FuncInfoMap; + +private: + + FuncInfoMap FIM; +}; +} // namespace llvm +#endif // LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 41876ed45c8c..d341fec6296f 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -65,6 +65,11 @@ using namespace llvm; namespace { +static cl::opt<bool> DisablePromoteAllocaToVector( + "disable-promote-alloca-to-vector", + cl::desc("Disable promote alloca to vector"), + cl::init(false)); + // FIXME: This can create globals so should be a module pass. 
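Stepping back from the raw hunks: the new AMDGPUPerfHintAnalysis above flags a function as memory bound when memory instructions exceed a percentage of all instructions, and flags a kernel for wave limiting when a weighted count (indirect and large-stride accesses weighted heavily) crosses a second threshold. The standalone sketch below mirrors that arithmetic with plain integers, using the default cl::opt values visible in the patch (both thresholds 50%, both weights 1000); the struct and function names here are invented for illustration and are not the patch's API.

#include <cstdio>

// Rough stand-in for AMDGPUPerfHintAnalysis::FuncInfo; field names are ours.
struct FuncCounts {
  unsigned MemInst;     // memory instructions
  unsigned Inst;        // all instructions
  unsigned IndirectMem; // indirect-access memory instructions
  unsigned LargeStride; // large-stride memory instructions
};

// Defaults of the cl::opt knobs introduced by the patch.
constexpr unsigned MemBoundThresh = 50;  // percent
constexpr unsigned LimitWaveThresh = 50; // percent
constexpr unsigned IAWeight = 1000;
constexpr unsigned LSWeight = 1000;

static bool isMemBound(const FuncCounts &F) {
  return F.MemInst * 100 / F.Inst > MemBoundThresh;
}

static bool needLimitWave(const FuncCounts &F) {
  return (F.MemInst + F.IndirectMem * IAWeight + F.LargeStride * LSWeight) *
             100 / F.Inst > LimitWaveThresh;
}

int main() {
  // 30 of 100 instructions touch memory, one of them indirectly: the 30%
  // ratio is below the 50% memory-bound threshold, but the indirect access
  // is weighted as 1000, so the kernel still gets the wave limiter.
  FuncCounts F{30, 100, 1, 0};
  std::printf("memBound=%d limitWave=%d\n", isMemBound(F), needLimitWave(F));
  return 0;
}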
class AMDGPUPromoteAlloca : public FunctionPass { private: @@ -147,7 +152,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { IsAMDGCN = TT.getArch() == Triple::amdgcn; IsAMDHSA = TT.getOS() == Triple::AMDHSA; - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); if (!ST.isPromoteAllocaEnabled()) return false; @@ -169,8 +174,8 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { std::pair<Value *, Value *> AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>( - *Builder.GetInsertBlock()->getParent()); + const Function &F = *Builder.GetInsertBlock()->getParent(); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); if (!IsAMDHSA) { Function *LocalSizeYFn @@ -256,8 +261,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { } Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) { - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>( - *Builder.GetInsertBlock()->getParent()); + const AMDGPUSubtarget &ST = + AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent()); Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic; switch (N) { @@ -318,18 +323,19 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { static bool canVectorizeInst(Instruction *Inst, User *User) { switch (Inst->getOpcode()) { case Instruction::Load: { + // Currently only handle the case where the Pointer Operand is a GEP. + // Also we could not vectorize volatile or atomic loads. LoadInst *LI = cast<LoadInst>(Inst); - // Currently only handle the case where the Pointer Operand is a GEP so check for that case. - return isa<GetElementPtrInst>(LI->getPointerOperand()) && !LI->isVolatile(); + return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple(); } case Instruction::BitCast: - case Instruction::AddrSpaceCast: return true; case Instruction::Store: { // Must be the stored pointer operand, not a stored value, plus // since it should be canonical form, the User should be a GEP. + // Also we could not vectorize volatile or atomic stores. StoreInst *SI = cast<StoreInst>(Inst); - return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && !SI->isVolatile(); + return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple(); } default: return false; @@ -337,19 +343,25 @@ static bool canVectorizeInst(Instruction *Inst, User *User) { } static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) { + + if (DisablePromoteAllocaToVector) { + LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n"); + return false; + } + ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType()); - DEBUG(dbgs() << "Alloca candidate for vectorization\n"); + LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n"); // FIXME: There is no reason why we can't support larger arrays, we // are just being conservative for now. // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. 
Potentially these // could also be promoted but we don't currently handle this case if (!AllocaTy || - AllocaTy->getNumElements() > 4 || + AllocaTy->getNumElements() > 16 || AllocaTy->getNumElements() < 2 || !VectorType::isValidElementType(AllocaTy->getElementType())) { - DEBUG(dbgs() << " Cannot convert type to vector\n"); + LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); return false; } @@ -370,7 +382,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) { // If we can't compute a vector index from this GEP, then we can't // promote this alloca to vector. if (!Index) { - DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n'); + LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP + << '\n'); return false; } @@ -385,8 +398,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) { VectorType *VectorTy = arrayTypeToVecType(AllocaTy); - DEBUG(dbgs() << " Converting alloca to vector " - << *AllocaTy << " -> " << *VectorTy << '\n'); + LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " + << *VectorTy << '\n'); for (Value *V : WorkList) { Instruction *Inst = cast<Instruction>(V); @@ -443,7 +456,8 @@ static bool isCallPromotable(CallInst *CI) { case Intrinsic::lifetime_end: case Intrinsic::invariant_start: case Intrinsic::invariant_end: - case Intrinsic::invariant_group_barrier: + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: case Intrinsic::objectsize: return true; default: @@ -475,7 +489,8 @@ bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca, // important part is both must have the same address space at // the end. if (OtherObj != BaseAlloca) { - DEBUG(dbgs() << "Found a binary instruction with another alloca object\n"); + LLVM_DEBUG( + dbgs() << "Found a binary instruction with another alloca object\n"); return false; } @@ -588,7 +603,7 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { FunctionType *FTy = F.getFunctionType(); - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); // If the function has any arguments in the local address space, then it's // possible these arguments require the entire local memory space, so @@ -597,8 +612,8 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { PointerType *PtrTy = dyn_cast<PointerType>(ParamTy); if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) { LocalMemLimit = 0; - DEBUG(dbgs() << "Function has local memory argument. Promoting to " - "local memory disabled.\n"); + LLVM_DEBUG(dbgs() << "Function has local memory argument. 
Promoting to " + "local memory disabled.\n"); return false; } } @@ -667,13 +682,12 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { LocalMemLimit = MaxSizeWithWaveCount; - DEBUG( - dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n" - << " Rounding size to " << MaxSizeWithWaveCount - << " with a maximum occupancy of " << MaxOccupancy << '\n' - << " and " << (LocalMemLimit - CurrentLocalMemUsage) - << " available for promotion\n" - ); + LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage + << " bytes of LDS\n" + << " Rounding size to " << MaxSizeWithWaveCount + << " with a maximum occupancy of " << MaxOccupancy << '\n' + << " and " << (LocalMemLimit - CurrentLocalMemUsage) + << " available for promotion\n"); return true; } @@ -690,7 +704,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { // First try to replace the alloca with a vector Type *AllocaTy = I.getAllocatedType(); - DEBUG(dbgs() << "Trying to promote " << I << '\n'); + LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); if (tryPromoteAllocaToVector(&I, AS)) return true; // Promoted to vector. @@ -706,7 +720,9 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { case CallingConv::SPIR_KERNEL: break; default: - DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n"); + LLVM_DEBUG( + dbgs() + << " promote alloca to LDS not supported with calling convention.\n"); return false; } @@ -714,8 +730,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { if (!SufficientLDS) return false; - const AMDGPUSubtarget &ST = - TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; const DataLayout &DL = Mod->getDataLayout(); @@ -735,8 +750,8 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { NewSize += AllocSize; if (NewSize > LocalMemLimit) { - DEBUG(dbgs() << " " << AllocSize - << " bytes of local memory not available to promote\n"); + LLVM_DEBUG(dbgs() << " " << AllocSize + << " bytes of local memory not available to promote\n"); return false; } @@ -745,11 +760,11 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { std::vector<Value*> WorkList; if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { - DEBUG(dbgs() << " Do not know how to convert all uses\n"); + LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n"); return false; } - DEBUG(dbgs() << "Promoting alloca to local memory\n"); + LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n"); Function *F = I.getParent()->getParent(); @@ -843,31 +858,32 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { continue; case Intrinsic::memcpy: { MemCpyInst *MemCpy = cast<MemCpyInst>(Intr); - Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), - MemCpy->getLength(), MemCpy->getAlignment(), - MemCpy->isVolatile()); + Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlignment(), + MemCpy->getRawSource(), MemCpy->getSourceAlignment(), + MemCpy->getLength(), MemCpy->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::memmove: { MemMoveInst *MemMove = cast<MemMoveInst>(Intr); - Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(), - MemMove->getLength(), MemMove->getAlignment(), - MemMove->isVolatile()); + 
Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlignment(), + MemMove->getRawSource(), MemMove->getSourceAlignment(), + MemMove->getLength(), MemMove->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::memset: { MemSetInst *MemSet = cast<MemSetInst>(Intr); Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), - MemSet->getLength(), MemSet->getAlignment(), + MemSet->getLength(), MemSet->getDestAlignment(), MemSet->isVolatile()); Intr->eraseFromParent(); continue; } case Intrinsic::invariant_start: case Intrinsic::invariant_end: - case Intrinsic::invariant_group_barrier: + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: Intr->eraseFromParent(); // FIXME: I think the invariant marker should still theoretically apply, // but the intrinsics need to be changed to accept pointers with any diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 1ed02fae085a..012e4fe200aa 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -14,7 +14,9 @@ #include "AMDGPURegisterBankInfo.h" #include "AMDGPUInstrInfo.h" +#include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -50,10 +52,38 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) } -unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &A, - const RegisterBank &B, - unsigned Size) const { - return RegisterBankInfo::copyCost(A, B, Size); +static bool isConstant(const MachineOperand &MO, int64_t &C) { + const MachineFunction *MF = MO.getParent()->getParent()->getParent(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const MachineInstr *Def = MRI.getVRegDef(MO.getReg()); + if (!Def) + return false; + + if (Def->getOpcode() == AMDGPU::G_CONSTANT) { + C = Def->getOperand(1).getCImm()->getSExtValue(); + return true; + } + + if (Def->getOpcode() == AMDGPU::COPY) + return isConstant(Def->getOperand(1), C); + + return false; +} + +unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, + const RegisterBank &Src, + unsigned Size) const { + if (Dst.getID() == AMDGPU::SGPRRegBankID && + Src.getID() == AMDGPU::VGPRRegBankID) + return std::numeric_limits<unsigned>::max(); + + // SGPRRegBank with size 1 is actually vcc or another 64-bit sgpr written by + // the valu. + if (Size == 1 && Dst.getID() == AMDGPU::SCCRegBankID && + Src.getID() == AMDGPU::SGPRRegBankID) + return std::numeric_limits<unsigned>::max(); + + return RegisterBankInfo::copyCost(Dst, Src, Size); } const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( @@ -72,11 +102,11 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); InstructionMappings AltMappings; switch (MI.getOpcode()) { case TargetOpcode::G_LOAD: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); // FIXME: Should we be hard coding the size for these mappings? 
const InstructionMapping &SSMapping = getInstructionMapping( 1, 1, getOperandsMapping( @@ -104,6 +134,42 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( return AltMappings; } + case TargetOpcode::G_ICMP: { + unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); + const InstructionMapping &SSMapping = getInstructionMapping(1, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), + nullptr, // Predicate operand. + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), + 4); // Num Operands + AltMappings.push_back(&SSMapping); + + const InstructionMapping &SVMapping = getInstructionMapping(2, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), + nullptr, // Predicate operand. + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}), + 4); // Num Operands + AltMappings.push_back(&SVMapping); + + const InstructionMapping &VSMapping = getInstructionMapping(3, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), + nullptr, // Predicate operand. + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), + 4); // Num Operands + AltMappings.push_back(&VSMapping); + + const InstructionMapping &VVMapping = getInstructionMapping(4, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), + nullptr, // Predicate operand. + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}), + 4); // Num Operands + AltMappings.push_back(&VVMapping); + + return AltMappings; + } default: break; } @@ -120,7 +186,60 @@ static bool isInstrUniform(const MachineInstr &MI) { return false; const MachineMemOperand *MMO = *MI.memoperands_begin(); - return AMDGPU::isUniformMMO(MMO); + return AMDGPUInstrInfo::isUniformMMO(MMO); +} + +bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) { + unsigned Reg = MI.getOperand(i).getReg(); + const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); + if (Bank && Bank->getID() != AMDGPU::SGPRRegBankID) + return false; + } + return true; +} + +const RegisterBankInfo::InstructionMapping & +AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); + + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); + OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + } + return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), + MI.getNumOperands()); +} + +const RegisterBankInfo::InstructionMapping & +AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); + unsigned OpdIdx = 0; + + unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0); + + if (MI.getOperand(OpdIdx).isIntrinsicID()) + 
OpdsMapping[OpdIdx++] = nullptr; + + unsigned Reg1 = MI.getOperand(OpdIdx).getReg(); + unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI); + unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI); + OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1); + + for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) { + unsigned Size = getSizeInBits(MI.getOperand(OpdIdx).getReg(), MRI, *TRI); + OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + } + + return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), + MI.getNumOperands()); } const RegisterBankInfo::InstructionMapping & @@ -155,6 +274,22 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { // handle that during instruction selection? } +unsigned +AMDGPURegisterBankInfo::getRegBankID(unsigned Reg, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + unsigned Default) const { + + const RegisterBank *Bank = getRegBank(Reg, MRI, TRI); + return Bank ? Bank->getID() : Default; +} + +/// +/// This function must return a legal mapping, because +/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called +/// in RegBankSelect::Mode::Fast. Any mapping that would cause a +/// VGPR to SGPR generated is illegal. +/// const RegisterBankInfo::InstructionMapping & AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); @@ -166,16 +301,102 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); - bool IsComplete = true; switch (MI.getOpcode()) { default: - IsComplete = false; + return getInvalidInstructionMapping(); + case AMDGPU::G_ADD: + case AMDGPU::G_SUB: + case AMDGPU::G_MUL: + case AMDGPU::G_AND: + case AMDGPU::G_OR: + case AMDGPU::G_XOR: + case AMDGPU::G_SHL: + if (isSALUMapping(MI)) + return getDefaultMappingSOP(MI); + // Fall-through + + case AMDGPU::G_FADD: + case AMDGPU::G_FPTOSI: + case AMDGPU::G_FPTOUI: + case AMDGPU::G_FMUL: + return getDefaultMappingVOP(MI); + case AMDGPU::G_IMPLICIT_DEF: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; + } + case AMDGPU::G_FCONSTANT: case AMDGPU::G_CONSTANT: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } + case AMDGPU::G_EXTRACT: { + unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); + OpdsMapping[2] = nullptr; + break; + } + case AMDGPU::G_MERGE_VALUES: { + unsigned Bank = isSALUMapping(MI) ? + AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); + + OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); + // Op1 and Dst should use the same register bank. 
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) + OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); + break; + } + case AMDGPU::G_BITCAST: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); + break; + } + case AMDGPU::G_TRUNC: { + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + unsigned Bank = getRegBankID(Src, MRI, *TRI); + unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); + unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); + break; + } + case AMDGPU::G_ZEXT: { + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); + unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); + unsigned SrcBank = getRegBankID(Src, MRI, *TRI, + SrcSize == 1 ? AMDGPU::SGPRRegBankID : + AMDGPU::VGPRRegBankID); + unsigned DstBank = SrcBank; + if (SrcSize == 1) { + if (SrcBank == AMDGPU::SGPRRegBankID) + DstBank = AMDGPU::VGPRRegBankID; + else + DstBank = AMDGPU::SGPRRegBankID; + } + + OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank, SrcSize); + break; + } + case AMDGPU::G_FCMP: { + unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 1); + OpdsMapping[1] = nullptr; // Predicate Operand. + OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + break; + } case AMDGPU::G_GEP: { for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (!MI.getOperand(i).isReg()) @@ -204,24 +425,113 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } - case AMDGPU::G_LOAD: - return getInstrMappingForLoad(MI); + case AMDGPU::G_ICMP: { + unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); + unsigned Op0Bank = Op2Bank == AMDGPU::SGPRRegBankID && + Op3Bank == AMDGPU::SGPRRegBankID ? + AMDGPU::SCCRegBankID : AMDGPU::VGPRRegBankID; + OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1); + OpdsMapping[1] = nullptr; // Predicate Operand. + OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); + OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size); + break; + } + + + case AMDGPU::G_EXTRACT_VECTOR_ELT: { + unsigned IdxOp = 2; + int64_t Imm; + // XXX - Do we really need to fully handle these? The constant case should + // be legalized away before RegBankSelect? + + unsigned OutputBankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ? + AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + + unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(0).getReg()).getSizeInBits()); + OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, MRI.getType(MI.getOperand(1).getReg()).getSizeInBits()); + + // The index can be either if the source vector is VGPR. 
+ OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits()); + break; } + case AMDGPU::G_INSERT_VECTOR_ELT: { + // XXX - Do we really need to fully handle these? The constant case should + // be legalized away before RegBankSelect? + + int64_t Imm; + + unsigned IdxOp = MI.getOpcode() == AMDGPU::G_EXTRACT_VECTOR_ELT ? 2 : 3; + unsigned BankID = isSALUMapping(MI) && isConstant(MI.getOperand(IdxOp), Imm) ? + AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + + + + // TODO: Can do SGPR indexing, which would obviate the need for the + // isConstant check. + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); + OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); + } - if (!IsComplete) { - unsigned BankID = AMDGPU::SGPRRegBankID; - unsigned Size = 0; - for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) { - // If the operand is not a register default to the size of the previous - // operand. - // FIXME: Can't we pull the types from the MachineInstr rather than the - // operands. - if (MI.getOperand(Idx).isReg()) - Size = getSizeInBits(MI.getOperand(Idx).getReg(), MRI, *TRI); - OpdsMapping.push_back(AMDGPU::getValueMapping(BankID, Size)); + break; + } + case AMDGPU::G_INTRINSIC: { + switch (MI.getOperand(1).getIntrinsicID()) { + default: + return getInvalidInstructionMapping(); + case Intrinsic::maxnum: + case Intrinsic::minnum: + case Intrinsic::amdgcn_cvt_pkrtz: + return getDefaultMappingVOP(MI); + case Intrinsic::amdgcn_kernarg_segment_ptr: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } + } + break; + } + case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { + switch (MI.getOperand(0).getIntrinsicID()) { + default: + return getInvalidInstructionMapping(); + case Intrinsic::amdgcn_exp_compr: + OpdsMapping[0] = nullptr; // IntrinsicID + // FIXME: These are immediate values which can't be read from registers. + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + // FIXME: Could we support packed types here? + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + // FIXME: These are immediate values which can't be read from registers. + OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + break; + case Intrinsic::amdgcn_exp: + OpdsMapping[0] = nullptr; // IntrinsicID + // FIXME: These are immediate values which can't be read from registers. + OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + // FIXME: Could we support packed types here? + OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); + // FIXME: These are immediate values which can't be read from registers. 
+ OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + break; } + break; + } + case AMDGPU::G_LOAD: + return getInstrMappingForLoad(MI); } + return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands()); } diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index 201fdc1974c6..d48a66589873 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -16,19 +16,15 @@ #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#define GET_REGBANK_DECLARATIONS +#include "AMDGPUGenRegisterBank.inc" +#undef GET_REGBANK_DECLARATIONS + namespace llvm { class SIRegisterInfo; class TargetRegisterInfo; -namespace AMDGPU { -enum { - SGPRRegBankID = 0, - VGPRRegBankID = 1, - NumRegisterBanks -}; -} // End AMDGPU namespace. - /// This class provides the information for the target register banks. class AMDGPUGenRegisterBankInfo : public RegisterBankInfo { @@ -46,6 +42,13 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { const RegisterBankInfo::InstructionMapping & getInstrMappingForLoad(const MachineInstr &MI) const; + unsigned getRegBankID(unsigned Reg, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + unsigned Default = AMDGPU::VGPRRegBankID) const; + + bool isSALUMapping(const MachineInstr &MI) const; + const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const; + const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const; public: AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI); diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td index f4428e56035f..7f7f75f65647 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -14,3 +14,5 @@ def SGPRRegBank : RegisterBank<"SGPR", def VGPRRegBank : RegisterBank<"VGPR", [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512] >; + +def SCCRegBank : RegisterBank <"SCC", [SCC_CLASS ]>; diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp index 5e4d33aaa691..50f859addc2b 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -8,13 +8,15 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Parent TargetRegisterInfo class common to all hw codegen targets. +/// Parent TargetRegisterInfo class common to all hw codegen targets. // //===----------------------------------------------------------------------===// #include "AMDGPURegisterInfo.h" #include "AMDGPUTargetMachine.h" +#include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" using namespace llvm; @@ -25,7 +27,7 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} // they are not supported at this time. 
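The G_ICMP mapping earlier in this hunk keeps the 1-bit result on the new scalar condition bank (SCCRegBank, added in AMDGPURegisterBanks.td above) only when both compare inputs are uniform; any divergent input forces the result into a VGPR, and G_FCMP always produces a VGPR result. A standalone sketch of that selection rule, written here purely for illustration and not using the LLVM API:

// Standalone illustration of the G_ICMP bank choice (not LLVM API): the
// scalar condition bank is only usable when both compare inputs are uniform;
// a single divergent input forces the result into a vector register.
enum class Bank { SGPR, VGPR, SCC };

static Bank icmpResultBank(Bank Lhs, Bank Rhs) {
  return (Lhs == Bank::SGPR && Rhs == Bank::SGPR) ? Bank::SCC : Bank::VGPR;
}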
//===----------------------------------------------------------------------===// -unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { +unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) { static const unsigned SubRegs[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, @@ -37,6 +39,13 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { return SubRegs[Channel]; } +void AMDGPURegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { + MCRegAliasIterator R(Reg, this, true); + + for (; R.isValid(); ++R) + Reserved.set(*R); +} + #define GET_REGINFO_TARGET_DESC #include "AMDGPUGenRegisterInfo.inc" @@ -75,5 +84,6 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, } unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { - return AMDGPU::NoRegister; + const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + return FuncInfo->getFrameOffsetReg(); } diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h index d8604d2590f1..07de5fc549e2 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief TargetRegisterInfo interface that is implemented by all hw codegen +/// TargetRegisterInfo interface that is implemented by all hw codegen /// targets. // //===----------------------------------------------------------------------===// @@ -21,15 +21,19 @@ namespace llvm { -class AMDGPUSubtarget; +class GCNSubtarget; class TargetInstrInfo; struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { AMDGPURegisterInfo(); + bool enableMultipleCopyHints() const override { return true; } + /// \returns the sub reg enum value for the given \p Channel /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sub0) - unsigned getSubRegFromChannel(unsigned Channel) const; + static unsigned getSubRegFromChannel(unsigned Channel); + + void reserveRegisterTuples(BitVector &, unsigned Reg) const; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/lib/Target/AMDGPU/AMDGPURegisterInfo.td index 3bbcba826f63..ceabae524414 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.td +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.td @@ -19,5 +19,4 @@ foreach Index = 0-15 in { } -include "R600RegisterInfo.td" include "SIRegisterInfo.td" diff --git a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp index 83e56a9ab495..a861762a8c9e 100644 --- a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -249,8 +249,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { SmallVector<Argument *, 4> OutArgs; for (Argument &Arg : F.args()) { if (isOutArgumentCandidate(Arg)) { - DEBUG(dbgs() << "Found possible out argument " << Arg - << " in function " << F.getName() << '\n'); + LLVM_DEBUG(dbgs() << "Found possible out argument " << Arg + << " in function " << F.getName() << '\n'); OutArgs.push_back(&Arg); } } @@ -310,7 +310,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { SI = dyn_cast<StoreInst>(Q.getInst()); if (SI) { - DEBUG(dbgs() << "Found out argument store: " << *SI << '\n'); + LLVM_DEBUG(dbgs() << "Found out argument store: " << *SI << '\n'); ReplaceableStores.emplace_back(RI, SI); } else { ThisReplaceable = false; @@ -328,7 +328,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { if (llvm::find_if(ValVec, [OutArg](const std::pair<Argument *, Value *> &Entry) { return Entry.first == OutArg;}) != ValVec.end()) { - DEBUG(dbgs() << "Saw multiple out arg stores" << *OutArg << '\n'); + LLVM_DEBUG(dbgs() + << "Saw multiple out arg stores" << *OutArg << '\n'); // It is possible to see stores to the same argument multiple times, // but we expect these would have been optimized out already. ThisReplaceable = false; @@ -358,7 +359,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { F.getFunctionType()->params(), F.isVarArg()); - DEBUG(dbgs() << "Computed new return type: " << *NewRetTy << '\n'); + LLVM_DEBUG(dbgs() << "Computed new return type: " << *NewRetTy << '\n'); Function *NewFunc = Function::Create(NewFuncTy, Function::PrivateLinkage, F.getName() + ".body"); diff --git a/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/lib/Target/AMDGPU/AMDGPUSearchableTables.td new file mode 100644 index 000000000000..9dbd7751b4d8 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -0,0 +1,77 @@ +//===-- AMDGPUSearchableTables.td - ------------------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Resource intrinsics table. 
+//===----------------------------------------------------------------------===// + +class RsrcIntrinsic<AMDGPURsrcIntrinsic intr> { + Intrinsic Intr = !cast<Intrinsic>(intr); + bits<8> RsrcArg = intr.RsrcArg; + bit IsImage = intr.IsImage; +} + +def RsrcIntrinsics : GenericTable { + let FilterClass = "RsrcIntrinsic"; + let Fields = ["Intr", "RsrcArg", "IsImage"]; + + let PrimaryKey = ["Intr"]; + let PrimaryKeyName = "lookupRsrcIntrinsic"; +} + +foreach intr = !listconcat(AMDGPUBufferIntrinsics, + AMDGPUImageDimIntrinsics, + AMDGPUImageDimAtomicIntrinsics) in { + def : RsrcIntrinsic<!cast<AMDGPURsrcIntrinsic>(intr)>; +} + +class SourceOfDivergence<Intrinsic intr> { + Intrinsic Intr = intr; +} + +def SourcesOfDivergence : GenericTable { + let FilterClass = "SourceOfDivergence"; + let Fields = ["Intr"]; + + let PrimaryKey = ["Intr"]; + let PrimaryKeyName = "lookupSourceOfDivergence"; +} + +def : SourceOfDivergence<int_amdgcn_workitem_id_x>; +def : SourceOfDivergence<int_amdgcn_workitem_id_y>; +def : SourceOfDivergence<int_amdgcn_workitem_id_z>; +def : SourceOfDivergence<int_amdgcn_interp_mov>; +def : SourceOfDivergence<int_amdgcn_interp_p1>; +def : SourceOfDivergence<int_amdgcn_interp_p2>; +def : SourceOfDivergence<int_amdgcn_mbcnt_hi>; +def : SourceOfDivergence<int_amdgcn_mbcnt_lo>; +def : SourceOfDivergence<int_r600_read_tidig_x>; +def : SourceOfDivergence<int_r600_read_tidig_y>; +def : SourceOfDivergence<int_r600_read_tidig_z>; +def : SourceOfDivergence<int_amdgcn_atomic_inc>; +def : SourceOfDivergence<int_amdgcn_atomic_dec>; +def : SourceOfDivergence<int_amdgcn_ds_fadd>; +def : SourceOfDivergence<int_amdgcn_ds_fmin>; +def : SourceOfDivergence<int_amdgcn_ds_fmax>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_smin>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_umin>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_smax>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_umax>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_and>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_or>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_xor>; +def : SourceOfDivergence<int_amdgcn_buffer_atomic_cmpswap>; +def : SourceOfDivergence<int_amdgcn_ps_live>; +def : SourceOfDivergence<int_amdgcn_ds_swizzle>; + +foreach intr = AMDGPUImageDimAtomicIntrinsics in +def : SourceOfDivergence<intr>; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 80feaa44766f..98b49070fa99 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Implements the AMDGPU specific subclass of TargetSubtarget. +/// Implements the AMDGPU specific subclass of TargetSubtarget. 
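The two GenericTable definitions above compile into constant lookup tables, and the PrimaryKeyName fields name the generated accessors (lookupRsrcIntrinsic, lookupSourceOfDivergence). A sketch of how such tables are typically consulted; the generated header name, the AMDGPU namespace, and the exact entry types are assumptions inferred from the definitions above, not something this diff shows:

// Hedged sketch: accessor names come from PrimaryKeyName above; the include
// name and namespace are assumed, not confirmed by this patch.
#include "AMDGPUGenSearchableTables.inc"   // assumed generated header name

static bool isDivergentIntrinsic(unsigned IntrID) {
  // Non-null only for intrinsics registered as SourceOfDivergence above.
  return AMDGPU::lookupSourceOfDivergence(IntrID) != nullptr;
}

static int rsrcOperandIndex(unsigned IntrID) {
  // Entry fields mirror the RsrcIntrinsic class: Intr, RsrcArg, IsImage.
  if (const AMDGPU::RsrcIntrinsic *Entry = AMDGPU::lookupRsrcIntrinsic(IntrID))
    return Entry->RsrcArg;   // which operand carries the resource descriptor
  return -1;                 // not a buffer/image resource intrinsic
}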
// //===----------------------------------------------------------------------===// @@ -20,8 +20,10 @@ #include "AMDGPULegalizerInfo.h" #include "AMDGPURegisterBankInfo.h" #include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/IR/MDBuilder.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include <algorithm> @@ -32,12 +34,37 @@ using namespace llvm; #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR +#define AMDGPUSubtarget GCNSubtarget #include "AMDGPUGenSubtargetInfo.inc" +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#undef AMDGPUSubtarget +#include "R600GenSubtargetInfo.inc" -AMDGPUSubtarget::~AMDGPUSubtarget() = default; +GCNSubtarget::~GCNSubtarget() = default; + +R600Subtarget & +R600Subtarget::initializeSubtargetDependencies(const Triple &TT, + StringRef GPU, StringRef FS) { + SmallString<256> FullFS("+promote-alloca,+dx10-clamp,"); + FullFS += FS; + ParseSubtargetFeatures(GPU, FullFS); + + // FIXME: I don't think think Evergreen has any useful support for + // denormals, but should be checked. Should we issue a warning somewhere + // if someone tries to enable these? + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + FP32Denormals = false; + } + + HasMulU24 = getGeneration() >= EVERGREEN; + HasMulI24 = hasCaymanISA(); + + return *this; +} -AMDGPUSubtarget & -AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, +GCNSubtarget & +GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS) { // Determine default and user-specified characteristics // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be @@ -92,26 +119,43 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, HasMovrel = true; } + HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; + return *this; } -AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, - const TargetMachine &TM) - : AMDGPUGenSubtargetInfo(TT, GPU, FS), +AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, + const FeatureBitset &FeatureBits) : + TargetTriple(TT), + SubtargetFeatureBits(FeatureBits), + Has16BitInsts(false), + HasMadMixInsts(false), + FP32Denormals(false), + FPExceptions(false), + HasSDWA(false), + HasVOP3PInsts(false), + HasMulI24(true), + HasMulU24(true), + HasFminFmaxLegacy(true), + EnablePromoteAlloca(false), + LocalMemorySize(0), + WavefrontSize(0) + { } + +GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, + const GCNTargetMachine &TM) : + AMDGPUGenSubtargetInfo(TT, GPU, FS), + AMDGPUSubtarget(TT, getFeatureBits()), TargetTriple(TT), - Gen(TT.getArch() == Triple::amdgcn ? 
SOUTHERN_ISLANDS : R600), + Gen(SOUTHERN_ISLANDS), IsaVersion(ISAVersion0_0_0), - WavefrontSize(0), - LocalMemorySize(0), LDSBankCount(0), MaxPrivateElementSize(0), FastFMAF32(false), HalfRate64Ops(false), - FP32Denormals(false), FP64FP16Denormals(false), - FPExceptions(false), DX10Clamp(false), FlatForGlobal(false), AutoWaitcntBeforeBarrier(false), @@ -123,57 +167,56 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, EnableXNACK(false), TrapHandler(false), DebuggerInsertNops(false), - DebuggerReserveRegs(false), DebuggerEmitPrologue(false), EnableHugePrivateBuffer(false), EnableVGPRSpilling(false), - EnablePromoteAlloca(false), EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), EnableSIScheduler(false), + EnableDS128(false), DumpCode(false), FP64(false), - FMA(false), - IsGCN(false), GCN3Encoding(false), CIInsts(false), GFX9Insts(false), SGPRInitBug(false), HasSMemRealTime(false), - Has16BitInsts(false), HasIntClamp(false), - HasVOP3PInsts(false), - HasMadMixInsts(false), + HasFmaMixInsts(false), HasMovrel(false), HasVGPRIndexMode(false), HasScalarStores(false), + HasScalarAtomics(false), HasInv2PiInlineImm(false), - HasSDWA(false), HasSDWAOmod(false), HasSDWAScalar(false), HasSDWASdst(false), HasSDWAMac(false), HasSDWAOutModsVOPC(false), HasDPP(false), + HasDLInsts(false), + D16PreservesUnusedBits(false), FlatAddressSpace(false), FlatInstOffsets(false), FlatGlobalInsts(false), FlatScratchInsts(false), AddNoCarryInsts(false), + HasUnpackedD16VMem(false), - R600ALUInst(false), - CaymanISA(false), - CFALUBug(false), - HasVertexCache(false), - TexVTXClauseSize(0), ScalarizeGlobal(false), FeatureDisable(false), - InstrItins(getInstrItineraryForCPU(GPU)) { + InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), + TLInfo(TM, *this), + FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { AS = AMDGPU::getAMDGPUAS(TT); - initializeSubtargetDependencies(TT, GPU, FS); + CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); + Legalizer.reset(new AMDGPULegalizerInfo(*this, TM)); + RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); + InstSelector.reset(new AMDGPUInstructionSelector( + *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM)); } unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, @@ -198,6 +241,12 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, return NumWaves; } +unsigned +AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const { + const auto *MFI = MF.getInfo<SIMachineFunctionInfo>(); + return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction()); +} + std::pair<unsigned, unsigned> AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const { switch (CC) { @@ -357,27 +406,64 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { return true; } -R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, - const TargetMachine &TM) : - AMDGPUSubtarget(TT, GPU, FS, TM), - InstrInfo(*this), - FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), - TLInfo(TM, *this) {} +uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, + unsigned &MaxAlign) const { + assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || + F.getCallingConv() == CallingConv::SPIR_KERNEL); -SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, - const TargetMachine &TM) - : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this), - 
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), - TLInfo(TM, *this) { - CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); - Legalizer.reset(new AMDGPULegalizerInfo()); + const DataLayout &DL = F.getParent()->getDataLayout(); + uint64_t ExplicitArgBytes = 0; + MaxAlign = 1; - RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); - InstSelector.reset(new AMDGPUInstructionSelector( - *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()))); + for (const Argument &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + + unsigned Align = DL.getABITypeAlignment(ArgTy); + uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); + ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize; + MaxAlign = std::max(MaxAlign, Align); + } + + return ExplicitArgBytes; } -void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, +unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, + unsigned &MaxAlign) const { + uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign); + + unsigned ExplicitOffset = getExplicitKernelArgOffset(F); + + uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes; + unsigned ImplicitBytes = getImplicitArgNumBytes(F); + if (ImplicitBytes != 0) { + unsigned Alignment = getAlignmentForImplicitArgPtr(); + TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; + } + + // Being able to dereference past the end is useful for emitting scalar loads. + return alignTo(TotalSize, 4); +} + +R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, + const TargetMachine &TM) : + R600GenSubtargetInfo(TT, GPU, FS), + AMDGPUSubtarget(TT, getFeatureBits()), + InstrInfo(*this), + FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), + FMA(false), + CaymanISA(false), + CFALUBug(false), + DX10Clamp(false), + HasVertexCache(false), + R600ALUInst(false), + FP64(false), + TexVTXClauseSize(0), + Gen(R600), + TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)), + InstrItins(getInstrItineraryForCPU(GPU)), + AS (AMDGPU::getAMDGPUAS(TT)) { } + +void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const { // Track register pressure so the scheduler can try to decrease // pressure once register usage is above the threshold defined by @@ -394,22 +480,12 @@ void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, Policy.ShouldTrackLaneMasks = true; } -bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { +bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const { return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); } -unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF, - unsigned ExplicitArgBytes) const { - unsigned ImplicitBytes = getImplicitArgNumBytes(MF); - if (ImplicitBytes == 0) - return ExplicitArgBytes; - - unsigned Alignment = getAlignmentForImplicitArgPtr(); - return alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; -} - -unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { - if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { +unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { + if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { if (SGPRs <= 80) return 10; if (SGPRs <= 88) @@ -431,7 +507,7 @@ unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { return 5; } -unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { +unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned 
VGPRs) const { if (VGPRs <= 24) return 10; if (VGPRs <= 28) @@ -453,7 +529,7 @@ unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const { return 1; } -unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { +unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); if (MFI.hasFlatScratchInit()) { if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) @@ -467,7 +543,7 @@ unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { return 2; // VCC. } -unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { +unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { const Function &F = MF.getFunction(); const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); @@ -517,7 +593,7 @@ unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { MaxAddressableNumSGPRs); } -unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { +unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { const Function &F = MF.getFunction(); const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); @@ -532,10 +608,6 @@ unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { unsigned Requested = AMDGPU::getIntegerAttribute( F, "amdgpu-num-vgpr", MaxNumVGPRs); - // Make sure requested value does not violate subtarget's specifications. - if (Requested && Requested <= getReservedNumVGPRs(MF)) - Requested = 0; - // Make sure requested value is compatible with values implied by // default/requested minimum/maximum number of waves per execution unit. if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) @@ -548,7 +620,7 @@ unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { MaxNumVGPRs = Requested; } - return MaxNumVGPRs - getReservedNumVGPRs(MF); + return MaxNumVGPRs; } namespace { @@ -602,7 +674,21 @@ struct MemOpClusterMutation : ScheduleDAGMutation { }; } // namespace -void SISubtarget::getPostRAMutations( +void GCNSubtarget::getPostRAMutations( std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo)); } + +const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { + if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn) + return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>()); + else + return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>()); +} + +const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) { + if (TM.getTargetTriple().getArch() == Triple::amdgcn) + return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F)); + else + return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F)); +} diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index cf4a691d4b58..623109733651 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -8,7 +8,7 @@ //==-----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU specific subclass of TargetSubtarget. +/// AMDGPU specific subclass of TargetSubtarget. 
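getExplicitKernArgSize and getKernArgSegmentSize above implement a simple packing rule: each explicit argument is placed at the next offset aligned to its ABI alignment, implicit arguments (if any) are appended after realigning, and the total is rounded up to 4 bytes so scalar loads may dereference slightly past the end. A standalone arithmetic sketch of the same rule, illustrative only and deliberately omitting the explicit-offset detail handled by the real code:

// Standalone sketch of the kernarg packing rule (not LLVM API).
#include <cstdint>
#include <vector>

static uint64_t alignToPow2(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);   // Align must be a power of two
}

struct Arg { uint64_t Size, Align; };

static uint64_t kernArgSegmentSize(const std::vector<Arg> &Args,
                                   uint64_t ImplicitBytes,
                                   uint64_t ImplicitAlign) {
  uint64_t Bytes = 0;
  // Explicit arguments are laid out in order at their ABI alignment.
  for (const Arg &A : Args)
    Bytes = alignToPow2(Bytes, A.Align) + A.Size;
  // Implicit arguments (e.g. grid sizes) are appended at their own alignment.
  if (ImplicitBytes != 0)
    Bytes = alignToPow2(Bytes, ImplicitAlign) + ImplicitBytes;
  // Round up so scalar loads can safely read a full dword at the end.
  return alignToPow2(Bytes, 4);
}
// Example: (i32, ptr, i16) -> 4 bytes, realign to 8 and add 8 -> 16, add 2
// -> 18, rounded up to 20 bytes of explicit arguments.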
// //===----------------------------------------------------------------------===// @@ -23,7 +23,6 @@ #include "SIFrameLowering.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" @@ -40,24 +39,216 @@ #define GET_SUBTARGETINFO_HEADER #include "AMDGPUGenSubtargetInfo.inc" +#define GET_SUBTARGETINFO_HEADER +#include "R600GenSubtargetInfo.inc" namespace llvm { class StringRef; -class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { +class AMDGPUSubtarget { public: enum Generation { R600 = 0, - R700, - EVERGREEN, - NORTHERN_ISLANDS, - SOUTHERN_ISLANDS, - SEA_ISLANDS, - VOLCANIC_ISLANDS, - GFX9, + R700 = 1, + EVERGREEN = 2, + NORTHERN_ISLANDS = 3, + SOUTHERN_ISLANDS = 4, + SEA_ISLANDS = 5, + VOLCANIC_ISLANDS = 6, + GFX9 = 7 }; +private: + Triple TargetTriple; + +protected: + const FeatureBitset &SubtargetFeatureBits; + bool Has16BitInsts; + bool HasMadMixInsts; + bool FP32Denormals; + bool FPExceptions; + bool HasSDWA; + bool HasVOP3PInsts; + bool HasMulI24; + bool HasMulU24; + bool HasFminFmaxLegacy; + bool EnablePromoteAlloca; + int LocalMemorySize; + unsigned WavefrontSize; + +public: + AMDGPUSubtarget(const Triple &TT, const FeatureBitset &FeatureBits); + + static const AMDGPUSubtarget &get(const MachineFunction &MF); + static const AMDGPUSubtarget &get(const TargetMachine &TM, + const Function &F); + + /// \returns Default range flat work group size for a calling convention. + std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; + + /// \returns Subtarget's default pair of minimum/maximum flat work group sizes + /// for function \p F, or minimum/maximum flat work group sizes explicitly + /// requested using "amdgpu-flat-work-group-size" attribute attached to + /// function \p F. + /// + /// \returns Subtarget's default values if explicitly requested values cannot + /// be converted to integer, or violate subtarget's specifications. + std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; + + /// \returns Subtarget's default pair of minimum/maximum number of waves per + /// execution unit for function \p F, or minimum/maximum number of waves per + /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute + /// attached to function \p F. + /// + /// \returns Subtarget's default values if explicitly requested values cannot + /// be converted to integer, violate subtarget's specifications, or are not + /// compatible with minimum/maximum number of waves limited by flat work group + /// size, register usage, and/or lds usage. + std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; + + /// Return the amount of LDS that can be used that will not restrict the + /// occupancy lower than WaveCount. + unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, + const Function &) const; + + /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if + /// the given LDS memory size is the only constraint. 
+ unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; + + unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const; + + bool isAmdHsaOS() const { + return TargetTriple.getOS() == Triple::AMDHSA; + } + + bool isAmdPalOS() const { + return TargetTriple.getOS() == Triple::AMDPAL; + } + + bool isMesa3DOS() const { + return TargetTriple.getOS() == Triple::Mesa3D; + } + + bool isMesaKernel(const Function &F) const { + return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv()); + } + + bool isAmdCodeObjectV2(const Function &F) const { + return isAmdHsaOS() || isMesaKernel(F); + } + + bool has16BitInsts() const { + return Has16BitInsts; + } + + bool hasMadMixInsts() const { + return HasMadMixInsts; + } + + bool hasFP32Denormals() const { + return FP32Denormals; + } + + bool hasFPExceptions() const { + return FPExceptions; + } + + bool hasSDWA() const { + return HasSDWA; + } + + bool hasVOP3PInsts() const { + return HasVOP3PInsts; + } + + bool hasMulI24() const { + return HasMulI24; + } + + bool hasMulU24() const { + return HasMulU24; + } + + bool hasFminFmaxLegacy() const { + return HasFminFmaxLegacy; + } + + bool isPromoteAllocaEnabled() const { + return EnablePromoteAlloca; + } + + unsigned getWavefrontSize() const { + return WavefrontSize; + } + + int getLocalMemorySize() const { + return LocalMemorySize; + } + + unsigned getAlignmentForImplicitArgPtr() const { + return isAmdHsaOS() ? 8 : 4; + } + + /// Returns the offset in bytes from the start of the input buffer + /// of the first explicit kernel argument. + unsigned getExplicitKernelArgOffset(const Function &F) const { + return isAmdCodeObjectV2(F) ? 0 : 36; + } + + /// \returns Maximum number of work groups per compute unit supported by the + /// subtarget and limited by given \p FlatWorkGroupSize. + unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const { + return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(SubtargetFeatureBits, + FlatWorkGroupSize); + } + + /// \returns Minimum flat work group size supported by the subtarget. + unsigned getMinFlatWorkGroupSize() const { + return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(SubtargetFeatureBits); + } + + /// \returns Maximum flat work group size supported by the subtarget. + unsigned getMaxFlatWorkGroupSize() const { + return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(SubtargetFeatureBits); + } + + /// \returns Maximum number of waves per execution unit supported by the + /// subtarget and limited by given \p FlatWorkGroupSize. + unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const { + return AMDGPU::IsaInfo::getMaxWavesPerEU(SubtargetFeatureBits, + FlatWorkGroupSize); + } + + /// \returns Minimum number of waves per execution unit supported by the + /// subtarget. + unsigned getMinWavesPerEU() const { + return AMDGPU::IsaInfo::getMinWavesPerEU(SubtargetFeatureBits); + } + + unsigned getMaxWavesPerEU() const { return 10; } + + /// Creates value range metadata on an workitemid.* inrinsic call or load. + bool makeLIDRangeMetadata(Instruction *I) const; + + /// \returns Number of bytes of arguments that are passed to a shader or + /// kernel in addition to the explicit ones declared for the function. 
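The queries above form the target-independent surface shared by the R600 and GCN subtargets, and the static AMDGPUSubtarget::get helpers defined earlier return that common base for either triple. A minimal sketch of the intended call pattern; the wrapper function name is illustrative and the usual LLVM target headers are assumed to be available:

// Sketch: target-agnostic code can stay off GCNSubtarget/R600Subtarget and
// use only the shared interface declared above.
static unsigned occupancyForLDS(const llvm::TargetMachine &TM,
                                const llvm::Function &F,
                                uint32_t LDSBytes) {
  const llvm::AMDGPUSubtarget &ST = llvm::AMDGPUSubtarget::get(TM, F);
  return ST.getOccupancyWithLocalMemSize(LDSBytes, F);
}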
+ unsigned getImplicitArgNumBytes(const Function &F) const { + if (isMesaKernel(F)) + return 16; + return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); + } + uint64_t getExplicitKernArgSize(const Function &F, + unsigned &MaxAlign) const; + unsigned getKernArgSegmentSize(const Function &F, + unsigned &MaxAlign) const; + + virtual ~AMDGPUSubtarget() {} +}; + +class GCNSubtarget : public AMDGPUGenSubtargetInfo, + public AMDGPUSubtarget { +public: enum { ISAVersion0_0_0, ISAVersion6_0_0, @@ -67,13 +258,14 @@ public: ISAVersion7_0_2, ISAVersion7_0_3, ISAVersion7_0_4, - ISAVersion8_0_0, ISAVersion8_0_1, ISAVersion8_0_2, ISAVersion8_0_3, ISAVersion8_1_0, ISAVersion9_0_0, - ISAVersion9_0_2 + ISAVersion9_0_2, + ISAVersion9_0_4, + ISAVersion9_0_6, }; enum TrapHandlerAbi { @@ -96,13 +288,18 @@ public: LLVMTrapHandlerRegValue = 1 }; +private: + /// GlobalISel related APIs. + std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; + std::unique_ptr<InstructionSelector> InstSelector; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; + protected: // Basic subtarget description. Triple TargetTriple; - Generation Gen; + unsigned Gen; unsigned IsaVersion; - unsigned WavefrontSize; - int LocalMemorySize; int LDSBankCount; unsigned MaxPrivateElementSize; @@ -111,9 +308,7 @@ protected: bool HalfRate64Ops; // Dynamially set bits that enable features. - bool FP32Denormals; bool FP64FP16Denormals; - bool FPExceptions; bool DX10Clamp; bool FlatForGlobal; bool AutoWaitcntBeforeBarrier; @@ -124,47 +319,48 @@ protected: bool EnableXNACK; bool TrapHandler; bool DebuggerInsertNops; - bool DebuggerReserveRegs; bool DebuggerEmitPrologue; // Used as options. bool EnableHugePrivateBuffer; bool EnableVGPRSpilling; - bool EnablePromoteAlloca; bool EnableLoadStoreOpt; bool EnableUnsafeDSOffsetFolding; bool EnableSIScheduler; + bool EnableDS128; bool DumpCode; // Subtarget statically properties set by tablegen bool FP64; bool FMA; + bool MIMG_R128; bool IsGCN; bool GCN3Encoding; bool CIInsts; bool GFX9Insts; bool SGPRInitBug; bool HasSMemRealTime; - bool Has16BitInsts; bool HasIntClamp; - bool HasVOP3PInsts; - bool HasMadMixInsts; + bool HasFmaMixInsts; bool HasMovrel; bool HasVGPRIndexMode; bool HasScalarStores; + bool HasScalarAtomics; bool HasInv2PiInlineImm; - bool HasSDWA; bool HasSDWAOmod; bool HasSDWAScalar; bool HasSDWASdst; bool HasSDWAMac; bool HasSDWAOutModsVOPC; bool HasDPP; + bool HasDLInsts; + bool D16PreservesUnusedBits; bool FlatAddressSpace; bool FlatInstOffsets; bool FlatGlobalInsts; bool FlatScratchInsts; bool AddNoCarryInsts; + bool HasUnpackedD16VMem; bool R600ALUInst; bool CaymanISA; bool CFALUBug; @@ -175,67 +371,68 @@ protected: // Dummy feature to use for assembler in tablegen. 
bool FeatureDisable; - InstrItineraryData InstrItins; SelectionDAGTargetInfo TSInfo; AMDGPUAS AS; +private: + SIInstrInfo InstrInfo; + SITargetLowering TLInfo; + SIFrameLowering FrameLowering; public: - AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, - const TargetMachine &TM); - ~AMDGPUSubtarget() override; + GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, + const GCNTargetMachine &TM); + ~GCNSubtarget() override; - AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT, + GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS); - const AMDGPUInstrInfo *getInstrInfo() const override = 0; - const AMDGPUFrameLowering *getFrameLowering() const override = 0; - const AMDGPUTargetLowering *getTargetLowering() const override = 0; - const AMDGPURegisterInfo *getRegisterInfo() const override = 0; + const SIInstrInfo *getInstrInfo() const override { + return &InstrInfo; + } - const InstrItineraryData *getInstrItineraryData() const override { - return &InstrItins; + const SIFrameLowering *getFrameLowering() const override { + return &FrameLowering; } - // Nothing implemented, just prevent crashes on use. - const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { - return &TSInfo; + const SITargetLowering *getTargetLowering() const override { + return &TLInfo; } - void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + const SIRegisterInfo *getRegisterInfo() const override { + return &InstrInfo.getRegisterInfo(); + } - bool isAmdHsaOS() const { - return TargetTriple.getOS() == Triple::AMDHSA; + const CallLowering *getCallLowering() const override { + return CallLoweringInfo.get(); } - bool isMesa3DOS() const { - return TargetTriple.getOS() == Triple::Mesa3D; + const InstructionSelector *getInstructionSelector() const override { + return InstSelector.get(); } - bool isOpenCLEnv() const { - return TargetTriple.getEnvironment() == Triple::OpenCL || - TargetTriple.getEnvironmentName() == "amdgizcl"; + const LegalizerInfo *getLegalizerInfo() const override { + return Legalizer.get(); } - bool isAmdPalOS() const { - return TargetTriple.getOS() == Triple::AMDPAL; + const RegisterBankInfo *getRegBankInfo() const override { + return RegBankInfo.get(); } - Generation getGeneration() const { - return Gen; + // Nothing implemented, just prevent crashes on use. 
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { + return &TSInfo; } - unsigned getWavefrontSize() const { - return WavefrontSize; + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + Generation getGeneration() const { + return (Generation)Gen; } unsigned getWavefrontSizeLog2() const { return Log2_32(WavefrontSize); } - int getLocalMemorySize() const { - return LocalMemorySize; - } - int getLDSBankCount() const { return LDSBankCount; } @@ -248,19 +445,19 @@ public: return AS; } - bool has16BitInsts() const { - return Has16BitInsts; - } - bool hasIntClamp() const { return HasIntClamp; } - bool hasVOP3PInsts() const { - return HasVOP3PInsts; + bool hasFP64() const { + return FP64; } - bool hasFP64() const { + bool hasMIMG_R128() const { + return MIMG_R128; + } + + bool hasHWFP64() const { return FP64; } @@ -273,15 +470,15 @@ public: } bool hasAddr64() const { - return (getGeneration() < VOLCANIC_ISLANDS); + return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); } bool hasBFE() const { - return (getGeneration() >= EVERGREEN); + return true; } bool hasBFI() const { - return (getGeneration() >= EVERGREEN); + return true; } bool hasBFM() const { @@ -289,62 +486,31 @@ public: } bool hasBCNT(unsigned Size) const { - if (Size == 32) - return (getGeneration() >= EVERGREEN); - - if (Size == 64) - return (getGeneration() >= SOUTHERN_ISLANDS); - - return false; - } - - bool hasMulU24() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasMulI24() const { - return (getGeneration() >= SOUTHERN_ISLANDS || - hasCaymanISA()); + return true; } bool hasFFBL() const { - return (getGeneration() >= EVERGREEN); + return true; } bool hasFFBH() const { - return (getGeneration() >= EVERGREEN); + return true; } bool hasMed3_16() const { - return getGeneration() >= GFX9; + return getGeneration() >= AMDGPUSubtarget::GFX9; } bool hasMin3Max3_16() const { - return getGeneration() >= GFX9; - } - - bool hasMadMixInsts() const { - return HasMadMixInsts; + return getGeneration() >= AMDGPUSubtarget::GFX9; } - bool hasSBufferLoadStoreAtomicDwordxN() const { - // Only use the "x1" variants on GFX9 or don't use the buffer variants. - // For x2 and higher variants, if the accessed region spans 2 VM pages and - // the second page is unmapped, the hw hangs. - // TODO: There is one future GFX9 chip that doesn't have this bug. - return getGeneration() != GFX9; + bool hasFmaMixInsts() const { + return HasFmaMixInsts; } bool hasCARRY() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasBORROW() const { - return (getGeneration() >= EVERGREEN); - } - - bool hasCaymanISA() const { - return CaymanISA; + return true; } bool hasFMA() const { @@ -359,10 +525,6 @@ public: return EnableHugePrivateBuffer; } - bool isPromoteAllocaEnabled() const { - return EnablePromoteAlloca; - } - bool unsafeDSOffsetFoldingEnabled() const { return EnableUnsafeDSOffsetFolding; } @@ -376,23 +538,10 @@ public: unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const; - /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if - /// the given LDS memory size is the only constraint. 
- unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; - - unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const { - const auto *MFI = MF.getInfo<SIMachineFunctionInfo>(); - return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction()); - } - bool hasFP16Denormals() const { return FP64FP16Denormals; } - bool hasFP32Denormals() const { - return FP32Denormals; - } - bool hasFP64Denormals() const { return FP64FP16Denormals; } @@ -401,10 +550,6 @@ public: return getGeneration() >= AMDGPUSubtarget::GFX9; } - bool hasFPExceptions() const { - return FPExceptions; - } - bool enableDX10Clamp() const { return DX10Clamp; } @@ -417,6 +562,12 @@ public: return FlatForGlobal; } + /// \returns If target supports ds_read/write_b128 and user enables generation + /// of ds_read/write_b128. + bool useDS128() const { + return CIInsts && EnableDS128; + } + /// \returns If MUBUF instructions always perform range checking, even for /// buffer resources used for private memory access. bool privateMemoryResourceIsRangeChecked() const { @@ -440,7 +591,7 @@ public: } bool hasApertureRegs() const { - return HasApertureRegs; + return HasApertureRegs; } bool isTrapHandlerEnabled() const { @@ -467,6 +618,10 @@ public: return FlatScratchInsts; } + bool hasFlatLgkmVMemCountInOrder() const { + return getGeneration() > GFX9; + } + bool hasD16LoadStore() const { return getGeneration() >= GFX9; } @@ -481,31 +636,19 @@ public: return AddNoCarryInsts; } - bool isMesaKernel(const MachineFunction &MF) const { - return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction().getCallingConv()); + bool hasUnpackedD16VMem() const { + return HasUnpackedD16VMem; } // Covers VS/PS/CS graphics shaders - bool isMesaGfxShader(const MachineFunction &MF) const { - return isMesa3DOS() && AMDGPU::isShader(MF.getFunction().getCallingConv()); - } - - bool isAmdCodeObjectV2(const MachineFunction &MF) const { - return isAmdHsaOS() || isMesaKernel(MF); + bool isMesaGfxShader(const Function &F) const { + return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); } bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; } - bool hasFminFmaxLegacy() const { - return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; - } - - bool hasSDWA() const { - return HasSDWA; - } - bool hasSDWAOmod() const { return HasSDWAOmod; } @@ -526,29 +669,28 @@ public: return HasSDWAOutModsVOPC; } - /// \brief Returns the offset in bytes from the start of the input buffer - /// of the first explicit kernel argument. - unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { - return isAmdCodeObjectV2(MF) ? 0 : 36; + bool vmemWriteNeedsExpWaitcnt() const { + return getGeneration() < SEA_ISLANDS; } - unsigned getAlignmentForImplicitArgPtr() const { - return isAmdHsaOS() ? 8 : 4; + bool hasDLInsts() const { + return HasDLInsts; } - unsigned getImplicitArgNumBytes(const MachineFunction &MF) const { - if (isMesaKernel(MF)) - return 16; - if (isAmdHsaOS() && isOpenCLEnv()) - return 32; - return 0; + bool d16PreservesUnusedBits() const { + return D16PreservesUnusedBits; } // Scratch is allocated in 256 dword per wave blocks for the entire // wavefront. When viewed from the perspecive of an arbitrary workitem, this // is 4-byte aligned. + // + // Only 4-byte alignment is really needed to access anything. Transformations + // on the pointer value itself may rely on the alignment / known low bits of + // the pointer. 
Set this to something above the minimum to avoid needing + // dynamic realignment in common cases. unsigned getStackAlignment() const { - return 4; + return 16; } bool enableMachineScheduler() const override { @@ -559,184 +701,43 @@ public: return true; } - void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;} - bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;} + void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } + bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } /// \returns Number of execution units per compute unit supported by the /// subtarget. unsigned getEUsPerCU() const { - return AMDGPU::IsaInfo::getEUsPerCU(getFeatureBits()); - } - - /// \returns Maximum number of work groups per compute unit supported by the - /// subtarget and limited by given \p FlatWorkGroupSize. - unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(getFeatureBits(), - FlatWorkGroupSize); + return AMDGPU::IsaInfo::getEUsPerCU(MCSubtargetInfo::getFeatureBits()); } /// \returns Maximum number of waves per compute unit supported by the /// subtarget without any kind of limitation. unsigned getMaxWavesPerCU() const { - return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits()); + return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits()); } /// \returns Maximum number of waves per compute unit supported by the /// subtarget and limited by given \p FlatWorkGroupSize. unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits(), + return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits(), FlatWorkGroupSize); } - /// \returns Minimum number of waves per execution unit supported by the - /// subtarget. - unsigned getMinWavesPerEU() const { - return AMDGPU::IsaInfo::getMinWavesPerEU(getFeatureBits()); - } - /// \returns Maximum number of waves per execution unit supported by the /// subtarget without any kind of limitation. unsigned getMaxWavesPerEU() const { - return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits()); - } - - /// \returns Maximum number of waves per execution unit supported by the - /// subtarget and limited by given \p FlatWorkGroupSize. - unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits(), - FlatWorkGroupSize); - } - - /// \returns Minimum flat work group size supported by the subtarget. - unsigned getMinFlatWorkGroupSize() const { - return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(getFeatureBits()); - } - - /// \returns Maximum flat work group size supported by the subtarget. - unsigned getMaxFlatWorkGroupSize() const { - return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(getFeatureBits()); + return AMDGPU::IsaInfo::getMaxWavesPerEU(); } /// \returns Number of waves per work group supported by the subtarget and /// limited by given \p FlatWorkGroupSize. unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getWavesPerWorkGroup(getFeatureBits(), - FlatWorkGroupSize); + return AMDGPU::IsaInfo::getWavesPerWorkGroup( + MCSubtargetInfo::getFeatureBits(), FlatWorkGroupSize); } - /// \returns Default range flat work group size for a calling convention. 
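The wave-count wrappers above relate flat work group size to wavefront occupancy. A conceptual sketch of the relationship, not the IsaInfo implementation: a flat work group of N work items occupies ceil(N / wavefront size) waves on a compute unit.

// Conceptual sketch only (not the IsaInfo implementation).
static unsigned wavesPerWorkGroup(unsigned FlatWorkGroupSize,
                                  unsigned WavefrontSize) {
  return (FlatWorkGroupSize + WavefrontSize - 1) / WavefrontSize;
}
// e.g. a 256-item work group on a 64-wide wavefront runs as 4 waves.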
- std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; - - /// \returns Subtarget's default pair of minimum/maximum flat work group sizes - /// for function \p F, or minimum/maximum flat work group sizes explicitly - /// requested using "amdgpu-flat-work-group-size" attribute attached to - /// function \p F. - /// - /// \returns Subtarget's default values if explicitly requested values cannot - /// be converted to integer, or violate subtarget's specifications. - std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; - - /// \returns Subtarget's default pair of minimum/maximum number of waves per - /// execution unit for function \p F, or minimum/maximum number of waves per - /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute - /// attached to function \p F. - /// - /// \returns Subtarget's default values if explicitly requested values cannot - /// be converted to integer, violate subtarget's specifications, or are not - /// compatible with minimum/maximum number of waves limited by flat work group - /// size, register usage, and/or lds usage. - std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; - - /// Creates value range metadata on an workitemid.* inrinsic call or load. - bool makeLIDRangeMetadata(Instruction *I) const; -}; - -class R600Subtarget final : public AMDGPUSubtarget { -private: - R600InstrInfo InstrInfo; - R600FrameLowering FrameLowering; - R600TargetLowering TLInfo; - -public: - R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, - const TargetMachine &TM); - - const R600InstrInfo *getInstrInfo() const override { - return &InstrInfo; - } - - const R600FrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - - const R600TargetLowering *getTargetLowering() const override { - return &TLInfo; - } - - const R600RegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); - } - - bool hasCFAluBug() const { - return CFALUBug; - } - - bool hasVertexCache() const { - return HasVertexCache; - } - - short getTexVTXClauseSize() const { - return TexVTXClauseSize; - } -}; - -class SISubtarget final : public AMDGPUSubtarget { -private: - SIInstrInfo InstrInfo; - SIFrameLowering FrameLowering; - SITargetLowering TLInfo; - - /// GlobalISel related APIs. - std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; - std::unique_ptr<InstructionSelector> InstSelector; - std::unique_ptr<LegalizerInfo> Legalizer; - std::unique_ptr<RegisterBankInfo> RegBankInfo; - -public: - SISubtarget(const Triple &TT, StringRef CPU, StringRef FS, - const TargetMachine &TM); - - const SIInstrInfo *getInstrInfo() const override { - return &InstrInfo; - } - - const SIFrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - - const SITargetLowering *getTargetLowering() const override { - return &TLInfo; - } - - const CallLowering *getCallLowering() const override { - return CallLoweringInfo.get(); - } - - const InstructionSelector *getInstructionSelector() const override { - return InstSelector.get(); - } - - const LegalizerInfo *getLegalizerInfo() const override { - return Legalizer.get(); - } - - const RegisterBankInfo *getRegBankInfo() const override { - return RegBankInfo.get(); - } - - const SIRegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); - } + // static wrappers + static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); // XXX - Why is this here if it isn't in the default pass set? 
bool enableEarlyIfConversion() const override { @@ -746,7 +747,7 @@ public: void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; - bool isVGPRSpillingEnabled(const Function& F) const; + bool isVGPRSpillingEnabled(const Function &F) const; unsigned getMaxNumUserSGPRs() const { return 16; @@ -776,6 +777,10 @@ public: return HasScalarStores; } + bool hasScalarAtomics() const { + return HasScalarAtomics; + } + bool hasInv2PiInlineImm() const { return HasInv2PiInlineImm; } @@ -789,18 +794,13 @@ public: } bool debuggerSupported() const { - return debuggerInsertNops() && debuggerReserveRegs() && - debuggerEmitPrologue(); + return debuggerInsertNops() && debuggerEmitPrologue(); } bool debuggerInsertNops() const { return DebuggerInsertNops; } - bool debuggerReserveRegs() const { - return DebuggerReserveRegs; - } - bool debuggerEmitPrologue() const { return DebuggerEmitPrologue; } @@ -829,52 +829,61 @@ public: return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; } - unsigned getKernArgSegmentSize(const MachineFunction &MF, - unsigned ExplictArgBytes) const; - - /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs + /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; - /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs + /// Return the maximum number of waves per SIMD for kernels using \p VGPRs + /// VGPRs unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; /// \returns true if the flat_scratch register should be initialized with the /// pointer to the wave's scratch memory rather than a size and offset. bool flatScratchIsPointer() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; + } + + /// \returns true if the machine has merged shaders in which s0-s7 are + /// reserved by the hardware and user SGPRs start at s8 + bool hasMergedShaders() const { return getGeneration() >= GFX9; } /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { - return AMDGPU::IsaInfo::getSGPRAllocGranule(getFeatureBits()); + return AMDGPU::IsaInfo::getSGPRAllocGranule( + MCSubtargetInfo::getFeatureBits()); } /// \returns SGPR encoding granularity supported by the subtarget. unsigned getSGPREncodingGranule() const { - return AMDGPU::IsaInfo::getSGPREncodingGranule(getFeatureBits()); + return AMDGPU::IsaInfo::getSGPREncodingGranule( + MCSubtargetInfo::getFeatureBits()); } /// \returns Total number of SGPRs supported by the subtarget. unsigned getTotalNumSGPRs() const { - return AMDGPU::IsaInfo::getTotalNumSGPRs(getFeatureBits()); + return AMDGPU::IsaInfo::getTotalNumSGPRs(MCSubtargetInfo::getFeatureBits()); } /// \returns Addressable number of SGPRs supported by the subtarget. unsigned getAddressableNumSGPRs() const { - return AMDGPU::IsaInfo::getAddressableNumSGPRs(getFeatureBits()); + return AMDGPU::IsaInfo::getAddressableNumSGPRs( + MCSubtargetInfo::getFeatureBits()); } /// \returns Minimum number of SGPRs that meets the given number of waves per /// execution unit requirement supported by the subtarget. unsigned getMinNumSGPRs(unsigned WavesPerEU) const { - return AMDGPU::IsaInfo::getMinNumSGPRs(getFeatureBits(), WavesPerEU); + return AMDGPU::IsaInfo::getMinNumSGPRs(MCSubtargetInfo::getFeatureBits(), + WavesPerEU); } /// \returns Maximum number of SGPRs that meets the given number of waves per /// execution unit requirement supported by the subtarget. 
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { - return AMDGPU::IsaInfo::getMaxNumSGPRs(getFeatureBits(), WavesPerEU, - Addressable); + return AMDGPU::IsaInfo::getMaxNumSGPRs(MCSubtargetInfo::getFeatureBits(), + WavesPerEU, Addressable); } /// \returns Reserved number of SGPRs for given function \p MF. @@ -892,39 +901,39 @@ public: /// \returns VGPR allocation granularity supported by the subtarget. unsigned getVGPRAllocGranule() const { - return AMDGPU::IsaInfo::getVGPRAllocGranule(getFeatureBits()); + return AMDGPU::IsaInfo::getVGPRAllocGranule( + MCSubtargetInfo::getFeatureBits()); } /// \returns VGPR encoding granularity supported by the subtarget. unsigned getVGPREncodingGranule() const { - return AMDGPU::IsaInfo::getVGPREncodingGranule(getFeatureBits()); + return AMDGPU::IsaInfo::getVGPREncodingGranule( + MCSubtargetInfo::getFeatureBits()); } /// \returns Total number of VGPRs supported by the subtarget. unsigned getTotalNumVGPRs() const { - return AMDGPU::IsaInfo::getTotalNumVGPRs(getFeatureBits()); + return AMDGPU::IsaInfo::getTotalNumVGPRs(MCSubtargetInfo::getFeatureBits()); } /// \returns Addressable number of VGPRs supported by the subtarget. unsigned getAddressableNumVGPRs() const { - return AMDGPU::IsaInfo::getAddressableNumVGPRs(getFeatureBits()); + return AMDGPU::IsaInfo::getAddressableNumVGPRs( + MCSubtargetInfo::getFeatureBits()); } /// \returns Minimum number of VGPRs that meets given number of waves per /// execution unit requirement supported by the subtarget. unsigned getMinNumVGPRs(unsigned WavesPerEU) const { - return AMDGPU::IsaInfo::getMinNumVGPRs(getFeatureBits(), WavesPerEU); + return AMDGPU::IsaInfo::getMinNumVGPRs(MCSubtargetInfo::getFeatureBits(), + WavesPerEU); } /// \returns Maximum number of VGPRs that meets given number of waves per /// execution unit requirement supported by the subtarget. unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { - return AMDGPU::IsaInfo::getMaxNumVGPRs(getFeatureBits(), WavesPerEU); - } - - /// \returns Reserved number of VGPRs for given function \p MF. - unsigned getReservedNumVGPRs(const MachineFunction &MF) const { - return debuggerReserveRegs() ? 4 : 0; + return AMDGPU::IsaInfo::getMaxNumVGPRs(MCSubtargetInfo::getFeatureBits(), + WavesPerEU); } /// \returns Maximum number of VGPRs that meets number of waves per execution @@ -942,6 +951,119 @@ public: const override; }; +class R600Subtarget final : public R600GenSubtargetInfo, + public AMDGPUSubtarget { +private: + R600InstrInfo InstrInfo; + R600FrameLowering FrameLowering; + bool FMA; + bool CaymanISA; + bool CFALUBug; + bool DX10Clamp; + bool HasVertexCache; + bool R600ALUInst; + bool FP64; + short TexVTXClauseSize; + Generation Gen; + R600TargetLowering TLInfo; + InstrItineraryData InstrItins; + SelectionDAGTargetInfo TSInfo; + AMDGPUAS AS; + +public: + R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, + const TargetMachine &TM); + + const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; } + + const R600FrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + + const R600TargetLowering *getTargetLowering() const override { + return &TLInfo; + } + + const R600RegisterInfo *getRegisterInfo() const override { + return &InstrInfo.getRegisterInfo(); + } + + const InstrItineraryData *getInstrItineraryData() const override { + return &InstrItins; + } + + // Nothing implemented, just prevent crashes on use. 
+ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { + return &TSInfo; + } + + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + Generation getGeneration() const { + return Gen; + } + + unsigned getStackAlignment() const { + return 4; + } + + R600Subtarget &initializeSubtargetDependencies(const Triple &TT, + StringRef GPU, StringRef FS); + + bool hasBFE() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBFI() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasBCNT(unsigned Size) const { + if (Size == 32) + return (getGeneration() >= EVERGREEN); + + return false; + } + + bool hasBORROW() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasCARRY() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasCaymanISA() const { + return CaymanISA; + } + + bool hasFFBL() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasFFBH() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasFMA() const { return FMA; } + + bool hasCFAluBug() const { return CFALUBug; } + + bool hasVertexCache() const { return HasVertexCache; } + + short getTexVTXClauseSize() const { return TexVTXClauseSize; } + + AMDGPUAS getAMDGPUAS() const { return AS; } + + bool enableMachineScheduler() const override { + return true; + } + + bool enableSubRegLiveness() const override { + return true; + } +}; + } // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 2042dbf6d5e2..2205819c444f 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief The AMDGPU target machine contains all of the hardware specific +/// The AMDGPU target machine contains all of the hardware specific /// information needed to emit code for R600 and SI GPUs. // //===----------------------------------------------------------------------===// @@ -31,7 +31,6 @@ #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" @@ -40,6 +39,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" @@ -79,7 +79,7 @@ static cl::opt<bool> EnableLoadStoreVectorizer( cl::init(true), cl::Hidden); -// Option to to control global loads scalarization +// Option to control global loads scalarization static cl::opt<bool> ScalarizeGlobal( "amdgpu-scalarize-global-loads", cl::desc("Enable global load scalarization"), @@ -110,12 +110,6 @@ static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, cl::desc("Enable AMDGPU Alias Analysis"), cl::init(true)); -// Option to enable new waitcnt insertion pass. 
-static cl::opt<bool> EnableSIInsertWaitcntsPass( - "enable-si-insert-waitcnts", - cl::desc("Use new waitcnt insertion pass"), - cl::init(true)); - // Option to run late CFG structurizer static cl::opt<bool, true> LateCFGStructurize( "amdgpu-late-structurize", @@ -123,16 +117,23 @@ static cl::opt<bool, true> LateCFGStructurize( cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden); -static cl::opt<bool> EnableAMDGPUFunctionCalls( +static cl::opt<bool, true> EnableAMDGPUFunctionCalls( "amdgpu-function-calls", - cl::Hidden, cl::desc("Enable AMDGPU function call support"), - cl::init(false)); + cl::location(AMDGPUTargetMachine::EnableFunctionCalls), + cl::init(false), + cl::Hidden); // Enable lib calls simplifications static cl::opt<bool> EnableLibCallSimplify( "amdgpu-simplify-libcall", - cl::desc("Enable mdgpu library simplifications"), + cl::desc("Enable amdgpu library simplifications"), + cl::init(true), + cl::Hidden); + +static cl::opt<bool> EnableLowerKernelArguments( + "amdgpu-ir-lower-kernel-arguments", + cl::desc("Lower kernel argument loads in IR pass"), cl::init(true), cl::Hidden); @@ -147,6 +148,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeR600PacketizerPass(*PR); initializeR600ExpandSpecialInstrsPassPass(*PR); initializeR600VectorRegMergerPass(*PR); + initializeGlobalISel(*PR); initializeAMDGPUDAGToDAGISelPass(*PR); initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); @@ -160,6 +162,8 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); initializeAMDGPUArgumentUsageInfoPass(*PR); + initializeAMDGPULowerKernelArgumentsPass(*PR); + initializeAMDGPULowerKernelAttributesPass(*PR); initializeAMDGPULowerIntrinsicsPass(*PR); initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); @@ -167,7 +171,6 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); - initializeSIInsertWaitsPass(*PR); initializeSIInsertWaitcntsPass(*PR); initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); @@ -176,6 +179,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSIDebuggerInsertNopsPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); initializeSIFixWWMLivenessPass(*PR); + initializeSIFormMemoryClausesPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); initializeAMDGPUUseNativeCallsPass(*PR); @@ -260,24 +264,15 @@ GCNILPSchedRegistry("gcn-ilp", static StringRef computeDataLayout(const Triple &TT) { if (TT.getArch() == Triple::r600) { // 32-bit pointers. - if (TT.getEnvironmentName() == "amdgiz" || - TT.getEnvironmentName() == "amdgizcl") return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; - return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"; } // 32-bit private, local, and region pointers. 64-bit global, constant and // flat. 
- if (TT.getEnvironmentName() == "amdgiz" || - TT.getEnvironmentName() == "amdgizcl") - return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32" + return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32" "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; - return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" - "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" - "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"; } LLVM_READNONE @@ -317,9 +312,10 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, initAsmInfo(); } -AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; - bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; +bool AMDGPUTargetMachine::EnableFunctionCalls = false; + +AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { Attribute GPUAttr = F.getFnAttribute("target-cpu"); @@ -412,6 +408,10 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { // Add infer address spaces pass to the opt pipeline after inlining // but before SROA to increase SROA opportunities. PM.add(createInferAddressSpacesPass()); + + // This should run after inlining to have any chance of doing anything, + // and before other cleanup optimizations. + PM.add(createAMDGPULowerKernelAttributesPass()); }); } @@ -449,6 +449,11 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl( return I.get(); } +TargetTransformInfo +R600TargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(R600TTIImpl(this, F)); +} + //===----------------------------------------------------------------------===// // GCN Target Machine (SI+) //===----------------------------------------------------------------------===// @@ -461,7 +466,7 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL, bool JIT) : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} -const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { +const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { StringRef GPU = getGPUName(F); StringRef FS = getFeatureString(F); @@ -474,7 +479,7 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); - I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this); + I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this); } I->setScalarizeGlobalBehavior(ScalarizeGlobal); @@ -482,6 +487,11 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { return I.get(); } +TargetTransformInfo +GCNTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(GCNTTIImpl(this, F)); +} + //===----------------------------------------------------------------------===// // AMDGPU Pass Setup //===----------------------------------------------------------------------===// @@ -571,11 +581,6 @@ public: } // end anonymous namespace -TargetTransformInfo -AMDGPUTargetMachine::getTargetTransformInfo(const Function &F) { - return TargetTransformInfo(AMDGPUTTIImpl(this, F)); -} - void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { if (getOptLevel() == CodeGenOpt::Aggressive) addPass(createGVNPass()); @@ -584,6 +589,7 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { } void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { + addPass(createLICMPass()); addPass(createSeparateConstOffsetFromGEPPass()); addPass(createSpeculativeExecutionPass()); // ReassociateGEPs exposes more opportunites for SLSR. See @@ -629,7 +635,8 @@ void AMDGPUPassConfig::addIRPasses() { } // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. - addPass(createAMDGPUOpenCLImageTypeLoweringPass()); + if (TM.getTargetTriple().getArch() == Triple::r600) + addPass(createR600OpenCLImageTypeLoweringPass()); // Replace OpenCL enqueued block function pointers with global variables. addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); @@ -672,6 +679,10 @@ void AMDGPUPassConfig::addIRPasses() { } void AMDGPUPassConfig::addCodeGenPrepare() { + if (TM->getTargetTriple().getArch() == Triple::amdgcn && + EnableLowerKernelArguments) + addPass(createAMDGPULowerKernelArgumentsPass()); + TargetPassConfig::addCodeGenPrepare(); if (EnableLoadStoreVectorizer) @@ -739,7 +750,7 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( MachineSchedContext *C) const { - const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>(); if (ST.enableSIScheduler()) return createSIMachineScheduler(C); return createGCNMaxOccupancyMachineScheduler(C); @@ -782,7 +793,7 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&SILoadStoreOptimizerID); if (EnableSDWAPeephole) { addPass(&SIPeepholeSDWAID); - addPass(&MachineLICMID); + addPass(&EarlyMachineLICMID); addPass(&MachineCSEID); addPass(&SIFoldOperandsID); addPass(&DeadMachineInstructionElimID); @@ -851,6 +862,8 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); + insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID); + // This must be run immediately after phi elimination and before // TwoAddressInstructions, otherwise the processing of the tied operand of // SI_ELSE will introduce a copy of the tied operand source after the else. 
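Aside (not part of the patch): earlier in this file the amdgpu-function-calls option changes from cl::opt<bool> to cl::opt<bool, true> with cl::location(AMDGPUTargetMachine::EnableFunctionCalls), matching the pattern already used for amdgpu-late-structurize. With ExternalStorage = true the parsed value is written straight into the referenced static member instead of being held by the option object, so target code can test the flag directly. A minimal, self-contained sketch of that pattern against llvm/Support/CommandLine.h, using hypothetical names:

#include "llvm/Support/CommandLine.h"

struct MyTargetMachine {
  static bool EnableFunctionCalls;   // analogous to AMDGPUTargetMachine::EnableFunctionCalls
};
bool MyTargetMachine::EnableFunctionCalls = false;

// ExternalStorage = true: cl::location binds the option to the static bool,
// so parsing writes the value into MyTargetMachine::EnableFunctionCalls.
static llvm::cl::opt<bool, true> EnableCallsFlag(
    "my-function-calls",                              // hypothetical flag name
    llvm::cl::desc("Enable function call support (sketch)"),
    llvm::cl::location(MyTargetMachine::EnableFunctionCalls),
    llvm::cl::init(false), llvm::cl::Hidden);

int main(int argc, char **argv) {
  llvm::cl::ParseCommandLineOptions(argc, argv);
  return MyTargetMachine::EnableFunctionCalls ? 0 : 1;
}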
@@ -873,6 +886,10 @@ void GCNPassConfig::addPreSched2() { } void GCNPassConfig::addPreEmitPass() { + addPass(createSIMemoryLegalizerPass()); + addPass(createSIInsertWaitcntsPass()); + addPass(createSIShrinkInstructionsPass()); + // The hazard recognizer that runs as part of the post-ra scheduler does not // guarantee to be able handle all hazards correctly. This is because if there // are multiple scheduling regions in a basic block, the regions are scheduled @@ -881,15 +898,12 @@ void GCNPassConfig::addPreEmitPass() { // // Here we add a stand-alone hazard recognizer pass which can handle all // cases. + // + // FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would + // be better for it to emit S_NOP <N> when possible. addPass(&PostRAHazardRecognizerID); - if (EnableSIInsertWaitcntsPass) - addPass(createSIInsertWaitcntsPass()); - else - addPass(createSIInsertWaitsPass()); - addPass(createSIShrinkInstructionsPass()); addPass(&SIInsertSkipsPassID); - addPass(createSIMemoryLegalizerPass()); addPass(createSIDebuggerInsertNopsPass()); addPass(&BranchRelaxationPassID); } diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 5f9b2a7fca20..0fe14493fabd 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief The AMDGPU TargetMachine interface definition for hw codgen targets. +/// The AMDGPU TargetMachine interface definition for hw codgen targets. // //===----------------------------------------------------------------------===// @@ -34,7 +34,6 @@ namespace llvm { class AMDGPUTargetMachine : public LLVMTargetMachine { protected: std::unique_ptr<TargetLoweringObjectFile> TLOF; - AMDGPUIntrinsicInfo IntrinsicInfo; AMDGPUAS AS; StringRef getGPUName(const Function &F) const; @@ -42,6 +41,7 @@ protected: public: static bool EnableLateStructurizeCFG; + static bool EnableFunctionCalls; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, @@ -49,13 +49,8 @@ public: CodeGenOpt::Level OL); ~AMDGPUTargetMachine() override; - const AMDGPUSubtarget *getSubtargetImpl() const; - const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override = 0; - - const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { - return &IntrinsicInfo; - } - TargetTransformInfo getTargetTransformInfo(const Function &F) override; + const TargetSubtargetInfo *getSubtargetImpl() const; + const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override = 0; TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); @@ -91,6 +86,8 @@ public: const R600Subtarget *getSubtargetImpl(const Function &) const override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; + bool isMachineVerifierClean() const override { return false; } @@ -102,7 +99,8 @@ public: class GCNTargetMachine final : public AMDGPUTargetMachine { private: - mutable StringMap<std::unique_ptr<SISubtarget>> SubtargetMap; + AMDGPUIntrinsicInfo IntrinsicInfo; + mutable StringMap<std::unique_ptr<GCNSubtarget>> SubtargetMap; public: GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU, @@ -112,7 +110,13 @@ public: TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - const SISubtarget *getSubtargetImpl(const Function &) const override; + const GCNSubtarget *getSubtargetImpl(const Function &) const 
override; + + TargetTransformInfo getTargetTransformInfo(const Function &F) override; + + const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { + return &IntrinsicInfo; + } bool useIPRA() const override { return true; diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h index ca6210f69298..dd9dc1a88fc2 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// /// /// \file -/// \brief This file declares the AMDGPU-specific subclass of +/// This file declares the AMDGPU-specific subclass of /// TargetLoweringObjectFile. /// //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 77c2d4b956c6..a68b8d03f06e 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -17,12 +17,12 @@ #include "AMDGPUTargetTransformInfo.h" #include "AMDGPUSubtarget.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/ISDOpcodes.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" @@ -43,6 +43,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachineValueType.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include <algorithm> @@ -101,7 +102,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, unsigned ThresholdPrivate = UnrollThresholdPrivate; unsigned ThresholdLocal = UnrollThresholdLocal; unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal); - AMDGPUAS ASST = ST->getAMDGPUAS(); + const AMDGPUAS &ASST = AMDGPU::getAMDGPUAS(TargetTriple); for (const BasicBlock *BB : L->getBlocks()) { const DataLayout &DL = BB->getModule()->getDataLayout(); unsigned LocalGEPsSeen = 0; @@ -123,8 +124,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, continue; if (dependsOnLocalPhi(L, Br->getCondition())) { UP.Threshold += UnrollThresholdIf; - DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold - << " for loop:\n" << *L << " due to " << *Br << '\n'); + LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold + << " for loop:\n" + << *L << " due to " << *Br << '\n'); if (UP.Threshold >= MaxBoost) return; } @@ -200,61 +202,76 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, // Don't use the maximum allowed value here as it will make some // programs way too big. UP.Threshold = Threshold; - DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n" - << *L << " due to " << *GEP << '\n'); + LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold + << " for loop:\n" + << *L << " due to " << *GEP << '\n'); if (UP.Threshold >= MaxBoost) return; } } } -unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { +unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { // The concept of vector registers doesn't really exist. Some packed vector // operations operate on the normal 32-bit registers. - - // Number of VGPRs on SI. 
- if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 256; - - return 4 * 128; // XXX - 4 channels. Should these count as vector instead? + return 256; } -unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const { +unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const { // This is really the number of registers to fill when vectorizing / // interleaving loops, so we lie to avoid trying to use all registers. return getHardwareNumberOfRegisters(Vec) >> 3; } -unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const { +unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const { return 32; } -unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const { +unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const { return 32; } -unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { +unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize, + unsigned ChainSizeInBytes, + VectorType *VecTy) const { + unsigned VecRegBitWidth = VF * LoadSize; + if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32) + // TODO: Support element-size less than 32bit? + return 128 / LoadSize; + + return VF; +} + +unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize, + unsigned ChainSizeInBytes, + VectorType *VecTy) const { + unsigned VecRegBitWidth = VF * StoreSize; + if (VecRegBitWidth > 128) + return 128 / StoreSize; + + return VF; +} + +unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { AMDGPUAS AS = ST->getAMDGPUAS(); if (AddrSpace == AS.GLOBAL_ADDRESS || AddrSpace == AS.CONSTANT_ADDRESS || - AddrSpace == AS.FLAT_ADDRESS) - return 128; - if (AddrSpace == AS.LOCAL_ADDRESS || + AddrSpace == AS.CONSTANT_ADDRESS_32BIT) { + return 512; + } + + if (AddrSpace == AS.FLAT_ADDRESS || + AddrSpace == AS.LOCAL_ADDRESS || AddrSpace == AS.REGION_ADDRESS) - return 64; + return 128; + if (AddrSpace == AS.PRIVATE_ADDRESS) return 8 * ST->getMaxPrivateElementSize(); - if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && - (AddrSpace == AS.PARAM_D_ADDRESS || - AddrSpace == AS.PARAM_I_ADDRESS || - (AddrSpace >= AS.CONSTANT_BUFFER_0 && - AddrSpace <= AS.CONSTANT_BUFFER_15))) - return 128; llvm_unreachable("unhandled address space"); } -bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, +bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment, unsigned AddrSpace) const { // We allow vectorization of flat stores, even though we may need to decompose @@ -267,19 +284,19 @@ bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, return true; } -bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, +bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, unsigned Alignment, unsigned AddrSpace) const { return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); } -bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, +bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, unsigned Alignment, unsigned AddrSpace) const { return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); } -unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { +unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Disable unrolling if the loop is not vectorized. // TODO: Enable this again. 
if (VF == 1) @@ -288,11 +305,14 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { return 8; } -bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, +bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const { switch (Inst->getIntrinsicID()) { case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: { + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2)); auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4)); if (!Ordering || !Volatile) @@ -314,7 +334,7 @@ bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, } } -int AMDGPUTTIImpl::getArithmeticInstrCost( +int GCNTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) { @@ -424,7 +444,7 @@ int AMDGPUTTIImpl::getArithmeticInstrCost( Opd1PropInfo, Opd2PropInfo); } -unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) { +unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) { // XXX - For some reason this isn't called for switch. switch (Opcode) { case Instruction::Br: @@ -435,7 +455,38 @@ unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) { } } -int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, +int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty, + bool IsPairwise) { + EVT OrigTy = TLI->getValueType(DL, Ty); + + // Computes cost on targets that have packed math instructions(which support + // 16-bit types only). + if (IsPairwise || + !ST->hasVOP3PInsts() || + OrigTy.getScalarSizeInBits() != 16) + return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + return LT.first * getFullRateInstrCost(); +} + +int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy, + bool IsPairwise, + bool IsUnsigned) { + EVT OrigTy = TLI->getValueType(DL, Ty); + + // Computes cost on targets that have packed math instructions(which support + // 16-bit types only). 
+ if (IsPairwise || + !ST->hasVOP3PInsts() || + OrigTy.getScalarSizeInBits() != 16) + return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + return LT.first * getHalfRateInstrCost(); +} + +int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index) { switch (Opcode) { case Instruction::ExtractElement: @@ -460,52 +511,7 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, } } -static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) { - switch (I->getIntrinsicID()) { - case Intrinsic::amdgcn_workitem_id_x: - case Intrinsic::amdgcn_workitem_id_y: - case Intrinsic::amdgcn_workitem_id_z: - case Intrinsic::amdgcn_interp_mov: - case Intrinsic::amdgcn_interp_p1: - case Intrinsic::amdgcn_interp_p2: - case Intrinsic::amdgcn_mbcnt_hi: - case Intrinsic::amdgcn_mbcnt_lo: - case Intrinsic::r600_read_tidig_x: - case Intrinsic::r600_read_tidig_y: - case Intrinsic::r600_read_tidig_z: - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_image_atomic_swap: - case Intrinsic::amdgcn_image_atomic_add: - case Intrinsic::amdgcn_image_atomic_sub: - case Intrinsic::amdgcn_image_atomic_smin: - case Intrinsic::amdgcn_image_atomic_umin: - case Intrinsic::amdgcn_image_atomic_smax: - case Intrinsic::amdgcn_image_atomic_umax: - case Intrinsic::amdgcn_image_atomic_and: - case Intrinsic::amdgcn_image_atomic_or: - case Intrinsic::amdgcn_image_atomic_xor: - case Intrinsic::amdgcn_image_atomic_inc: - case Intrinsic::amdgcn_image_atomic_dec: - case Intrinsic::amdgcn_image_atomic_cmpswap: - case Intrinsic::amdgcn_buffer_atomic_swap: - case Intrinsic::amdgcn_buffer_atomic_add: - case Intrinsic::amdgcn_buffer_atomic_sub: - case Intrinsic::amdgcn_buffer_atomic_smin: - case Intrinsic::amdgcn_buffer_atomic_umin: - case Intrinsic::amdgcn_buffer_atomic_smax: - case Intrinsic::amdgcn_buffer_atomic_umax: - case Intrinsic::amdgcn_buffer_atomic_and: - case Intrinsic::amdgcn_buffer_atomic_or: - case Intrinsic::amdgcn_buffer_atomic_xor: - case Intrinsic::amdgcn_buffer_atomic_cmpswap: - case Intrinsic::amdgcn_ps_live: - case Intrinsic::amdgcn_ds_swizzle: - return true; - default: - return false; - } -} + static bool isArgPassedInSGPR(const Argument *A) { const Function *F = A->getParent(); @@ -535,7 +541,7 @@ static bool isArgPassedInSGPR(const Argument *A) { /// \returns true if the result of the value could potentially be /// different across workitems in a wavefront. -bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { +bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { if (const Argument *A = dyn_cast<Argument>(V)) return !isArgPassedInSGPR(A); @@ -556,7 +562,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { return true; if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) - return isIntrinsicSourceOfDivergence(Intrinsic); + return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID()); // Assume all function calls are a source of divergence. 
if (isa<CallInst>(V) || isa<InvokeInst>(V)) @@ -565,7 +571,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { return false; } -bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const { +bool GCNTTIImpl::isAlwaysUniform(const Value *V) const { if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { switch (Intrinsic->getIntrinsicID()) { default: @@ -578,7 +584,7 @@ bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const { return false; } -unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, +unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { if (ST->hasVOP3PInsts()) { VectorType *VT = cast<VectorType>(Tp); @@ -601,7 +607,7 @@ unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Inde return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } -bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller, +bool GCNTTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); const FeatureBitset &CallerBits = @@ -613,3 +619,114 @@ bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller, FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; return ((RealCallerBits & RealCalleeBits) == RealCalleeBits); } + +void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP) { + CommonTTI.getUnrollingPreferences(L, SE, UP); +} + +unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const { + return 4 * 128; // XXX - 4 channels. Should these count as vector instead? +} + +unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const { + return getHardwareNumberOfRegisters(Vec); +} + +unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const { + return 32; +} + +unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const { + return 32; +} + +unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { + AMDGPUAS AS = ST->getAMDGPUAS(); + if (AddrSpace == AS.GLOBAL_ADDRESS || + AddrSpace == AS.CONSTANT_ADDRESS) + return 128; + if (AddrSpace == AS.LOCAL_ADDRESS || + AddrSpace == AS.REGION_ADDRESS) + return 64; + if (AddrSpace == AS.PRIVATE_ADDRESS) + return 32; + + if ((AddrSpace == AS.PARAM_D_ADDRESS || + AddrSpace == AS.PARAM_I_ADDRESS || + (AddrSpace >= AS.CONSTANT_BUFFER_0 && + AddrSpace <= AS.CONSTANT_BUFFER_15))) + return 128; + llvm_unreachable("unhandled address space"); +} + +bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const { + // We allow vectorization of flat stores, even though we may need to decompose + // them later if they may access private memory. We don't have enough context + // here, and legalization can handle it. + if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) + return false; + return true; +} + +bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const { + return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); +} + +bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const { + return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); +} + +unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) { + // Disable unrolling if the loop is not vectorized. + // TODO: Enable this again. 
+ if (VF == 1) + return 1; + + return 8; +} + +unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) { + // XXX - For some reason this isn't called for switch. + switch (Opcode) { + case Instruction::Br: + case Instruction::Ret: + return 10; + default: + return BaseT::getCFInstrCost(Opcode); + } +} + +int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { + switch (Opcode) { + case Instruction::ExtractElement: + case Instruction::InsertElement: { + unsigned EltSize + = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType()); + if (EltSize < 32) { + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } + + // Extracts are just reads of a subregister, so are free. Inserts are + // considered free because we don't want to have any cost for scalarizing + // operations, and we don't have to copy into a different register class. + + // Dynamic indexing isn't free and is best avoided. + return Index == ~0u ? 2 : 0; + } + default: + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } +} + +void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP) { + CommonTTI.getUnrollingPreferences(L, SE, UP); +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 8899d2c6da8a..8e63d789e17d 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -21,6 +21,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -44,8 +45,26 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { friend BaseT; - const AMDGPUSubtarget *ST; + Triple TargetTriple; + +public: + explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), + TargetTriple(TM->getTargetTriple()) {} + + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); +}; + +class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { + using BaseT = BasicTTIImplBase<GCNTTIImpl>; + using TTI = TargetTransformInfo; + + friend BaseT; + + const GCNSubtarget *ST; const AMDGPUTargetLowering *TLI; + AMDGPUTTIImpl CommonTTI; bool IsGraphicsShader; const FeatureBitset InlineFeatureIgnoreList = { @@ -61,7 +80,6 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { AMDGPU::FeatureAutoWaitcntBeforeBarrier, AMDGPU::FeatureDebuggerEmitPrologue, AMDGPU::FeatureDebuggerInsertNops, - AMDGPU::FeatureDebuggerReserveRegs, // Property of the kernel/environment which can't actually differ. 
AMDGPU::FeatureSGPRInitBug, @@ -73,7 +91,7 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { AMDGPU::HalfRate64Ops }; - const AMDGPUSubtarget *getST() const { return ST; } + const GCNSubtarget *getST() const { return ST; } const AMDGPUTargetLowering *getTLI() const { return TLI; } static inline int getFullRateInstrCost() { @@ -98,10 +116,11 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { } public: - explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) + explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), - ST(TM->getSubtargetImpl(F)), + ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))), TLI(ST->getTargetLowering()), + CommonTTI(TM, F), IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {} bool hasBranchDivergence() { return true; } @@ -118,6 +137,12 @@ public: unsigned getNumberOfRegisters(bool Vector) const; unsigned getRegisterBitWidth(bool Vector) const; unsigned getMinVectorRegisterBitWidth() const; + unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, + unsigned ChainSizeInBytes, + VectorType *VecTy) const; + unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, + unsigned ChainSizeInBytes, + VectorType *VecTy) const; unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, @@ -166,6 +191,53 @@ public: const Function *Callee) const; unsigned getInliningThresholdMultiplier() { return 9; } + + int getArithmeticReductionCost(unsigned Opcode, + Type *Ty, + bool IsPairwise); + int getMinMaxReductionCost(Type *Ty, Type *CondTy, + bool IsPairwiseForm, + bool IsUnsigned); +}; + +class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> { + using BaseT = BasicTTIImplBase<R600TTIImpl>; + using TTI = TargetTransformInfo; + + friend BaseT; + + const R600Subtarget *ST; + const AMDGPUTargetLowering *TLI; + AMDGPUTTIImpl CommonTTI; + +public: + explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), + ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))), + TLI(ST->getTargetLowering()), + CommonTTI(TM, F) {} + + const R600Subtarget *getST() const { return ST; } + const AMDGPUTargetLowering *getTLI() const { return TLI; } + + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); + unsigned getHardwareNumberOfRegisters(bool Vec) const; + unsigned getNumberOfRegisters(bool Vec) const; + unsigned getRegisterBitWidth(bool Vector) const; + unsigned getMinVectorRegisterBitWidth() const; + unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; + bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment, + unsigned AddrSpace) const; + bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const; + bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const; + unsigned getMaxInterleaveFactor(unsigned VF); + unsigned getCFInstrCost(unsigned Opcode); + int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 6107f3a7dd18..0d3a1673696a 100644 --- a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -28,6 +28,7 @@ #include 
"llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -39,7 +40,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils.h" using namespace llvm; @@ -144,7 +145,8 @@ static BasicBlock *unifyReturnBlockSet(Function &F, if (PN) PN->addIncoming(BB->getTerminator()->getOperand(0), BB); - BB->getInstList().pop_back(); // Remove the return insn + // Remove and delete the return inst. + BB->getTerminator()->eraseFromParent(); BranchInst::Create(NewRetBlock, BB); } @@ -168,6 +170,9 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { SmallVector<BasicBlock *, 4> ReturningBlocks; SmallVector<BasicBlock *, 4> UnreachableBlocks; + // Dummy return block for infinite loop. + BasicBlock *DummyReturnBB = nullptr; + for (BasicBlock *BB : PDT.getRoots()) { if (isa<ReturnInst>(BB->getTerminator())) { if (!isUniformlyReached(DA, *BB)) @@ -175,6 +180,35 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { } else if (isa<UnreachableInst>(BB->getTerminator())) { if (!isUniformlyReached(DA, *BB)) UnreachableBlocks.push_back(BB); + } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { + + ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); + if (DummyReturnBB == nullptr) { + DummyReturnBB = BasicBlock::Create(F.getContext(), + "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + } + + if (BI->isUnconditional()) { + BasicBlock *LoopHeaderBB = BI->getSuccessor(0); + BI->eraseFromParent(); // Delete the unconditional branch. + // Add a new conditional branch with a dummy edge to the return block. + BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); + } else { // Conditional branch. + // Create a new transition block to hold the conditional branch. + BasicBlock *TransitionBB = BasicBlock::Create(F.getContext(), + "TransitionBlock", &F); + + // Move BI from BB to the new transition block. + BI->removeFromParent(); + TransitionBB->getInstList().push_back(BI); + + // Create a branch that will always branch to the transition block. + BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); + } } } @@ -189,7 +223,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { new UnreachableInst(F.getContext(), UnreachableBlock); for (BasicBlock *BB : UnreachableBlocks) { - BB->getInstList().pop_back(); // Remove the unreachable inst. + // Remove and delete the unreachable inst. + BB->getTerminator()->eraseFromParent(); BranchInst::Create(UnreachableBlock, BB); } } @@ -200,7 +235,8 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { Type *RetTy = F.getReturnType(); Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy); - UnreachableBlock->getInstList().pop_back(); // Remove the unreachable inst. + // Remove and delete the unreachable inst. 
+ UnreachableBlock->getTerminator()->eraseFromParent(); Function *UnreachableIntrin = Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable); diff --git a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp index b78568e89cfb..1f6d9234c1ed 100644 --- a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp +++ b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // \file -// \brief This pass that unifies multiple OpenCL metadata due to linking. +// This pass that unifies multiple OpenCL metadata due to linking. // //===----------------------------------------------------------------------===// @@ -37,7 +37,7 @@ namespace { } // end namespace kOCLMD - /// \brief Unify multiple OpenCL metadata due to linking. + /// Unify multiple OpenCL metadata due to linking. class AMDGPUUnifyMetadata : public ModulePass { public: static char ID; @@ -47,7 +47,7 @@ namespace { private: bool runOnModule(Module &M) override; - /// \brief Unify version metadata. + /// Unify version metadata. /// \return true if changes are made. /// Assume the named metadata has operands each of which is a pair of /// integer constant, e.g. @@ -82,7 +82,7 @@ namespace { return true; } - /// \brief Unify version metadata. + /// Unify version metadata. /// \return true if changes are made. /// Assume the named metadata has operands each of which is a list e.g. /// !Name = {!n1, !n2} diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 0a0e43123ae0..11cd49e5b3dc 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -11,6 +11,7 @@ #include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" #include "R600RegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SmallPtrSet.h" @@ -28,12 +29,12 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachineValueType.h" #include "llvm/Support/raw_ostream.h" #include <cassert> #include <cstddef> @@ -78,23 +79,18 @@ namespace { // //===----------------------------------------------------------------------===// -#define SHOWNEWINSTR(i) \ - DEBUG(dbgs() << "New instr: " << *i << "\n"); +#define SHOWNEWINSTR(i) LLVM_DEBUG(dbgs() << "New instr: " << *i << "\n"); -#define SHOWNEWBLK(b, msg) \ -DEBUG( \ - dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ - dbgs() << "\n"; \ -); +#define SHOWNEWBLK(b, msg) \ + LLVM_DEBUG(dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + dbgs() << "\n";); -#define SHOWBLK_DETAIL(b, msg) \ -DEBUG( \ - if (b) { \ - dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ - b->print(dbgs()); \ - dbgs() << "\n"; \ - } \ -); +#define SHOWBLK_DETAIL(b, msg) \ + LLVM_DEBUG(if (b) { \ + dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + b->print(dbgs()); \ + dbgs() << "\n"; \ + }); #define INVALIDSCCNUM -1 @@ -158,19 +154,19 @@ public: bool runOnMachineFunction(MachineFunction &MF) override { TII = MF.getSubtarget<R600Subtarget>().getInstrInfo(); TRI = &TII->getRegisterInfo(); - 
DEBUG(MF.dump();); + LLVM_DEBUG(MF.dump();); OrderedBlks.clear(); Visited.clear(); FuncRep = &MF; MLI = &getAnalysis<MachineLoopInfo>(); - DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); + LLVM_DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI);); MDT = &getAnalysis<MachineDominatorTree>(); - DEBUG(MDT->print(dbgs(), (const Module*)nullptr);); + LLVM_DEBUG(MDT->print(dbgs(), (const Module *)nullptr);); PDT = &getAnalysis<MachinePostDominatorTree>(); - DEBUG(PDT->print(dbgs());); + LLVM_DEBUG(PDT->print(dbgs());); prepare(); run(); - DEBUG(MF.dump();); + LLVM_DEBUG(MF.dump();); return true; } @@ -436,19 +432,19 @@ void AMDGPUCFGStructurizer::reversePredicateSetter( for (;; --I) { if (I == MBB.end()) continue; - if (I->getOpcode() == AMDGPU::PRED_X) { + if (I->getOpcode() == R600::PRED_X) { switch (I->getOperand(2).getImm()) { - case AMDGPU::PRED_SETE_INT: - I->getOperand(2).setImm(AMDGPU::PRED_SETNE_INT); + case R600::PRED_SETE_INT: + I->getOperand(2).setImm(R600::PRED_SETNE_INT); return; - case AMDGPU::PRED_SETNE_INT: - I->getOperand(2).setImm(AMDGPU::PRED_SETE_INT); + case R600::PRED_SETNE_INT: + I->getOperand(2).setImm(R600::PRED_SETE_INT); return; - case AMDGPU::PRED_SETE: - I->getOperand(2).setImm(AMDGPU::PRED_SETNE); + case R600::PRED_SETE: + I->getOperand(2).setImm(R600::PRED_SETNE); return; - case AMDGPU::PRED_SETNE: - I->getOperand(2).setImm(AMDGPU::PRED_SETE); + case R600::PRED_SETNE: + I->getOperand(2).setImm(R600::PRED_SETE); return; default: llvm_unreachable("PRED_X Opcode invalid!"); @@ -517,10 +513,10 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore( int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; + case R600::JUMP_COND: + case R600::JUMP: return R600::IF_PREDICATE_SET; + case R600::BRANCH_COND_i32: + case R600::BRANCH_COND_f32: return R600::IF_LOGICALNZ_f32; default: llvm_unreachable("internal error"); } return -1; @@ -528,10 +524,10 @@ int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; + case R600::JUMP_COND: + case R600::JUMP: return R600::IF_PREDICATE_SET; + case R600::BRANCH_COND_i32: + case R600::BRANCH_COND_f32: return R600::IF_LOGICALZ_f32; default: llvm_unreachable("internal error"); } return -1; @@ -539,8 +535,8 @@ int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; + case R600::JUMP_COND: + case R600::JUMP: return R600::CONTINUE_LOGICALNZ_i32; default: llvm_unreachable("internal error"); } return -1; @@ -548,8 +544,8 @@ int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; + case R600::JUMP_COND: + case R600::JUMP: return R600::CONTINUE_LOGICALZ_i32; default: llvm_unreachable("internal error"); } return -1; @@ -577,9 +573,9 @@ AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB, bool 
AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { switch (MI->getOpcode()) { - case AMDGPU::JUMP_COND: - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return true; + case R600::JUMP_COND: + case R600::BRANCH_COND_i32: + case R600::BRANCH_COND_f32: return true; default: return false; } @@ -588,8 +584,8 @@ bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { switch (MI->getOpcode()) { - case AMDGPU::JUMP: - case AMDGPU::BRANCH: + case R600::JUMP: + case R600::BRANCH: return true; default: return false; @@ -638,7 +634,7 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { MachineBasicBlock::reverse_iterator It = MBB->rbegin(); if (It != MBB->rend()) { MachineInstr *instr = &(*It); - if (instr->getOpcode() == AMDGPU::RETURN) + if (instr->getOpcode() == R600::RETURN) return instr; } return nullptr; @@ -650,9 +646,8 @@ bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { if (MI) assert(IsReturn); else if (IsReturn) - DEBUG( - dbgs() << "BB" << MBB->getNumber() - <<" is return block without RETURN instr\n";); + LLVM_DEBUG(dbgs() << "BB" << MBB->getNumber() + << " is return block without RETURN instr\n";); return IsReturn; } @@ -692,8 +687,8 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator It = Pre; while (It != E) { - if (Pre->getOpcode() == AMDGPU::CONTINUE - && It->getOpcode() == AMDGPU::ENDLOOP) + if (Pre->getOpcode() == R600::CONTINUE + && It->getOpcode() == R600::ENDLOOP) ContInstr.push_back(&*Pre); Pre = It; ++It; @@ -714,7 +709,7 @@ bool AMDGPUCFGStructurizer::prepare() { //FIXME: if not reducible flow graph, make it so ??? - DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";); + LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";); orderBlocks(FuncRep); @@ -757,14 +752,14 @@ bool AMDGPUCFGStructurizer::prepare() { bool AMDGPUCFGStructurizer::run() { //Assume reducible CFG... - DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); + LLVM_DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n"); #ifdef STRESSTEST //Use the worse block ordering to test the algorithm. ReverseVector(orderedBlks); #endif - DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks();); + LLVM_DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks();); int NumIter = 0; bool Finish = false; MachineBasicBlock *MBB; @@ -774,10 +769,8 @@ bool AMDGPUCFGStructurizer::run() { do { ++NumIter; - DEBUG( - dbgs() << "numIter = " << NumIter - << ", numRemaintedBlk = " << NumRemainedBlk << "\n"; - ); + LLVM_DEBUG(dbgs() << "numIter = " << NumIter + << ", numRemaintedBlk = " << NumRemainedBlk << "\n";); SmallVectorImpl<MachineBasicBlock *>::const_iterator It = OrderedBlks.begin(); @@ -799,10 +792,8 @@ bool AMDGPUCFGStructurizer::run() { SccBeginMBB = MBB; SccNumIter = 0; SccNumBlk = NumRemainedBlk; // Init to maximum possible number. 
- DEBUG( - dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB); - dbgs() << "\n"; - ); + LLVM_DEBUG(dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB); + dbgs() << "\n";); } if (!isRetiredBlock(MBB)) @@ -817,20 +808,16 @@ bool AMDGPUCFGStructurizer::run() { ++SccNumIter; int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It); if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) { - DEBUG( - dbgs() << "Can't reduce SCC " << getSCCNum(MBB) - << ", sccNumIter = " << SccNumIter; - dbgs() << "doesn't make any progress\n"; - ); + LLVM_DEBUG(dbgs() << "Can't reduce SCC " << getSCCNum(MBB) + << ", sccNumIter = " << SccNumIter; + dbgs() << "doesn't make any progress\n";); ContNextScc = true; } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) { SccNumBlk = sccRemainedNumBlk; It = SccBeginIter; ContNextScc = false; - DEBUG( - dbgs() << "repeat processing SCC" << getSCCNum(MBB) - << "sccNumIter = " << SccNumIter << '\n'; - ); + LLVM_DEBUG(dbgs() << "repeat processing SCC" << getSCCNum(MBB) + << "sccNumIter = " << SccNumIter << '\n';); } else { // Finish the current scc. ContNextScc = true; @@ -848,9 +835,7 @@ bool AMDGPUCFGStructurizer::run() { *GraphTraits<MachineFunction *>::nodes_begin(FuncRep); if (EntryMBB->succ_size() == 0) { Finish = true; - DEBUG( - dbgs() << "Reduce to one block\n"; - ); + LLVM_DEBUG(dbgs() << "Reduce to one block\n";); } else { int NewnumRemainedBlk = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end()); @@ -860,9 +845,7 @@ bool AMDGPUCFGStructurizer::run() { NumRemainedBlk = NewnumRemainedBlk; } else { MakeProgress = false; - DEBUG( - dbgs() << "No progress\n"; - ); + LLVM_DEBUG(dbgs() << "No progress\n";); } } } while (!Finish && MakeProgress); @@ -875,9 +858,7 @@ bool AMDGPUCFGStructurizer::run() { It != E; ++It) { if ((*It).second && (*It).second->IsRetired) { assert(((*It).first)->getNumber() != -1); - DEBUG( - dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n"; - ); + LLVM_DEBUG(dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";); (*It).first->eraseFromParent(); //Remove from the parent Function. 
} delete (*It).second; @@ -886,7 +867,7 @@ bool AMDGPUCFGStructurizer::run() { LLInfoMap.clear(); if (!Finish) { - DEBUG(FuncRep->viewCFG()); + LLVM_DEBUG(FuncRep->viewCFG()); report_fatal_error("IRREDUCIBLE_CFG"); } @@ -920,17 +901,13 @@ int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) { int NumMatch = 0; int CurMatch; - DEBUG( - dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n"; - ); + LLVM_DEBUG(dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";); while ((CurMatch = patternMatchGroup(MBB)) > 0) NumMatch += CurMatch; - DEBUG( - dbgs() << "End patternMatch BB" << MBB->getNumber() - << ", numMatch = " << NumMatch << "\n"; - ); + LLVM_DEBUG(dbgs() << "End patternMatch BB" << MBB->getNumber() + << ", numMatch = " << NumMatch << "\n";); return NumMatch; } @@ -1050,7 +1027,7 @@ int AMDGPUCFGStructurizer::loopendPatternMatch() { for (MachineLoop *ExaminedLoop : NestedLoops) { if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop]) continue; - DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump();); + LLVM_DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump();); int NumBreak = mergeLoop(ExaminedLoop); if (NumBreak == -1) break; @@ -1064,7 +1041,8 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { MBBVector ExitingMBBs; LoopRep->getExitingBlocks(ExitingMBBs); assert(!ExitingMBBs.empty() && "Infinite Loop not supported"); - DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";); + LLVM_DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() + << " exiting blocks\n";); // We assume a single ExitBlk MBBVector ExitBlks; LoopRep->getExitBlocks(ExitBlks); @@ -1106,11 +1084,9 @@ bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) { MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep]; if (TheEntry) { - DEBUG( - dbgs() << "isLoopContBreakBlock yes src1 = BB" - << Src1MBB->getNumber() - << " src2 = BB" << Src2MBB->getNumber() << "\n"; - ); + LLVM_DEBUG(dbgs() << "isLoopContBreakBlock yes src1 = BB" + << Src1MBB->getNumber() << " src2 = BB" + << Src2MBB->getNumber() << "\n";); return true; } } @@ -1122,9 +1098,8 @@ int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) { int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB); if (Num == 0) { - DEBUG( - dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n"; - ); + LLVM_DEBUG(dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" + << "\n";); Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB); } return Num; @@ -1138,22 +1113,16 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, //trueBlk could be the common post dominator DownBlk = TrueMBB; - DEBUG( - dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber() - << " true = BB" << TrueMBB->getNumber() - << ", numSucc=" << TrueMBB->succ_size() - << " false = BB" << FalseMBB->getNumber() << "\n"; - ); + LLVM_DEBUG(dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber() + << " true = BB" << TrueMBB->getNumber() + << ", numSucc=" << TrueMBB->succ_size() << " false = BB" + << FalseMBB->getNumber() << "\n";); while (DownBlk) { - DEBUG( - dbgs() << "check down = BB" << DownBlk->getNumber(); - ); + LLVM_DEBUG(dbgs() << "check down = BB" << DownBlk->getNumber();); if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) { - DEBUG( - dbgs() << " working\n"; - ); + LLVM_DEBUG(dbgs() << " working\n";); Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk); Num += 
cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk); @@ -1166,9 +1135,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, break; } - DEBUG( - dbgs() << " not working\n"; - ); + LLVM_DEBUG(dbgs() << " not working\n";); DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr; } // walk down the postDomTree @@ -1247,10 +1214,9 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1) MigrateFalse = true; - DEBUG( - dbgs() << "before improveSimpleJumpintoIf: "; - showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); - ); + LLVM_DEBUG( + dbgs() << "before improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);); // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk // @@ -1337,15 +1303,15 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2); - //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL" - MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF); + //insert R600::ENDIF to avoid special case "input landBlk == NULL" + MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, R600::ENDIF); if (LandBlkHasOtherPred) { report_fatal_error("Extra register needed to handle CFG"); unsigned CmpResReg = HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); report_fatal_error("Extra compare instruction needed to handle CFG"); - insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, + insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, CmpResReg, DebugLoc()); } @@ -1353,7 +1319,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // cause an assertion failure in the PostRA scheduling pass. unsigned InitReg = HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); - insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg, + insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, InitReg, DebugLoc()); if (MigrateTrue) { @@ -1363,7 +1329,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // (initVal != 1). 
report_fatal_error("Extra register needed to handle CFG"); } - insertInstrBefore(I, AMDGPU::ELSE); + insertInstrBefore(I, R600::ELSE); if (MigrateFalse) { migrateInstruction(FalseMBB, LandBlk, I); @@ -1375,7 +1341,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, if (LandBlkHasOtherPred) { // add endif - insertInstrBefore(I, AMDGPU::ENDIF); + insertInstrBefore(I, R600::ENDIF); // put initReg = 2 to other predecessors of landBlk for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(), @@ -1385,10 +1351,9 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, report_fatal_error("Extra register needed to handle CFG"); } } - DEBUG( - dbgs() << "result from improveSimpleJumpintoIf: "; - showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0); - ); + LLVM_DEBUG( + dbgs() << "result from improveSimpleJumpintoIf: "; + showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);); // update landBlk *LandMBBPtr = LandBlk; @@ -1398,10 +1363,8 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, MachineBasicBlock *SrcMBB) { - DEBUG( - dbgs() << "serialPattern BB" << DstMBB->getNumber() - << " <= BB" << SrcMBB->getNumber() << "\n"; - ); + LLVM_DEBUG(dbgs() << "serialPattern BB" << DstMBB->getNumber() << " <= BB" + << SrcMBB->getNumber() << "\n";); DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); DstMBB->removeSuccessor(SrcMBB, true); @@ -1416,26 +1379,15 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) { assert (TrueMBB); - DEBUG( - dbgs() << "ifPattern BB" << MBB->getNumber(); - dbgs() << "{ "; - if (TrueMBB) { - dbgs() << "BB" << TrueMBB->getNumber(); - } - dbgs() << " } else "; - dbgs() << "{ "; - if (FalseMBB) { - dbgs() << "BB" << FalseMBB->getNumber(); - } - dbgs() << " }\n "; - dbgs() << "landBlock: "; - if (!LandMBB) { - dbgs() << "NULL"; - } else { - dbgs() << "BB" << LandMBB->getNumber(); - } - dbgs() << "\n"; - ); + LLVM_DEBUG(dbgs() << "ifPattern BB" << MBB->getNumber(); dbgs() << "{ "; + if (TrueMBB) { dbgs() << "BB" << TrueMBB->getNumber(); } dbgs() + << " } else "; + dbgs() << "{ "; if (FalseMBB) { + dbgs() << "BB" << FalseMBB->getNumber(); + } dbgs() << " }\n "; + dbgs() << "landBlock: "; if (!LandMBB) { dbgs() << "NULL"; } else { + dbgs() << "BB" << LandMBB->getNumber(); + } dbgs() << "\n";); int OldOpcode = BranchMI->getOpcode(); DebugLoc BranchDL = BranchMI->getDebugLoc(); @@ -1462,7 +1414,7 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, } if (FalseMBB) { - insertInstrBefore(I, AMDGPU::ELSE); + insertInstrBefore(I, R600::ELSE); MBB->splice(I, FalseMBB, FalseMBB->begin(), FalseMBB->end()); MBB->removeSuccessor(FalseMBB, true); @@ -1471,7 +1423,7 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, retireBlock(FalseMBB); MLI->removeBlock(FalseMBB); } - insertInstrBefore(I, AMDGPU::ENDIF); + insertInstrBefore(I, R600::ENDIF); BranchMI->eraseFromParent(); @@ -1481,18 +1433,19 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, MachineBasicBlock *LandMBB) { - DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber() - << " land = BB" << LandMBB->getNumber() << "\n";); + LLVM_DEBUG(dbgs() << 
"loopPattern header = BB" << DstBlk->getNumber() + << " land = BB" << LandMBB->getNumber() << "\n";); - insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); - insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); + insertInstrBefore(DstBlk, R600::WHILELOOP, DebugLoc()); + insertInstrEnd(DstBlk, R600::ENDLOOP, DebugLoc()); DstBlk->replaceSuccessor(DstBlk, LandMBB); } void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, MachineBasicBlock *LandMBB) { - DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber() - << " land = BB" << LandMBB->getNumber() << "\n";); + LLVM_DEBUG(dbgs() << "loopbreakPattern exiting = BB" + << ExitingMBB->getNumber() << " land = BB" + << LandMBB->getNumber() << "\n";); MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB); assert(BranchMI && isCondBranch(BranchMI)); DebugLoc DL = BranchMI->getDebugLoc(); @@ -1500,9 +1453,9 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, MachineBasicBlock::iterator I = BranchMI; if (TrueBranch != LandMBB) reversePredicateSetter(I, *I->getParent()); - insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL); - insertInstrBefore(I, AMDGPU::BREAK); - insertInstrBefore(I, AMDGPU::ENDIF); + insertCondBranchBefore(ExitingMBB, I, R600::IF_PREDICATE_SET, R600::PREDICATE_BIT, DL); + insertInstrBefore(I, R600::BREAK); + insertInstrBefore(I, R600::ENDIF); //now branchInst can be erase safely BranchMI->eraseFromParent(); //now take care of successors, retire blocks @@ -1511,9 +1464,9 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, MachineBasicBlock *ContMBB) { - DEBUG(dbgs() << "settleLoopcontBlock conting = BB" - << ContingMBB->getNumber() - << ", cont = BB" << ContMBB->getNumber() << "\n";); + LLVM_DEBUG(dbgs() << "settleLoopcontBlock conting = BB" + << ContingMBB->getNumber() << ", cont = BB" + << ContMBB->getNumber() << "\n";); MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB); if (MI) { @@ -1531,8 +1484,8 @@ void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, getBranchZeroOpcode(OldOpcode); insertCondBranchBefore(I, BranchOpcode, DL); // insertEnd to ensure phi-moves, if exist, go before the continue-instr. - insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL); - insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL); + insertInstrEnd(ContingMBB, R600::CONTINUE, DL); + insertInstrEnd(ContingMBB, R600::ENDIF, DL); } else { int BranchOpcode = TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) : @@ -1547,7 +1500,7 @@ void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, // location we've just inserted that reference here so it should be // representative insertEnd to ensure phi-moves, if exist, go before the // continue-instr. 
- insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, + insertInstrEnd(ContingMBB, R600::CONTINUE, getLastDebugLocInBB(ContingMBB)); } } @@ -1587,10 +1540,9 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, numClonedInstr += MBB->size(); - DEBUG( - dbgs() << "Cloned block: " << "BB" - << MBB->getNumber() << "size " << MBB->size() << "\n"; - ); + LLVM_DEBUG(dbgs() << "Cloned block: " + << "BB" << MBB->getNumber() << "size " << MBB->size() + << "\n";); SHOWNEWBLK(CloneMBB, "result of Cloned block: "); @@ -1603,26 +1555,22 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, //look for the input branchinstr, not the AMDGPU branchinstr MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB); if (!BranchMI) { - DEBUG( - dbgs() << "migrateInstruction don't see branch instr\n"; - ); + LLVM_DEBUG(dbgs() << "migrateInstruction don't see branch instr\n";); SpliceEnd = SrcMBB->end(); } else { - DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI); + LLVM_DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI); SpliceEnd = BranchMI; } - DEBUG( - dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size() - << "srcSize = " << SrcMBB->size() << "\n"; - ); + LLVM_DEBUG(dbgs() << "migrateInstruction before splice dstSize = " + << DstMBB->size() << "srcSize = " << SrcMBB->size() + << "\n";); //splice insert before insertPos DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd); - DEBUG( - dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size() - << "srcSize = " << SrcMBB->size() << '\n'; - ); + LLVM_DEBUG(dbgs() << "migrateInstruction after splice dstSize = " + << DstMBB->size() << "srcSize = " << SrcMBB->size() + << '\n';); } MachineBasicBlock * @@ -1640,7 +1588,7 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); FuncRep->push_back(DummyExitBlk); //insert to function SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); - DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); + LLVM_DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); LLVMContext &Ctx = LoopHeader->getParent()->getFunction().getContext(); Ctx.emitError("Extra register needed to handle CFG"); return nullptr; @@ -1653,7 +1601,7 @@ void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. 
while ((BranchMI = getLoopendBlockBranchInstr(MBB)) && isUncondBranch(BranchMI)) { - DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI); + LLVM_DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI); BranchMI->eraseFromParent(); } } @@ -1669,7 +1617,7 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); assert(BranchMI && isCondBranch(BranchMI)); - DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI); + LLVM_DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI); BranchMI->eraseFromParent(); SHOWNEWBLK(MBB1, "Removing redundant successor"); MBB->removeSuccessor(MBB1, true); @@ -1679,7 +1627,7 @@ void AMDGPUCFGStructurizer::addDummyExitBlock( SmallVectorImpl<MachineBasicBlock*> &RetMBB) { MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); FuncRep->push_back(DummyExitBlk); //insert to function - insertInstrEnd(DummyExitBlk, AMDGPU::RETURN); + insertInstrEnd(DummyExitBlk, R600::RETURN); for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(), E = RetMBB.end(); It != E; ++It) { @@ -1688,10 +1636,8 @@ void AMDGPUCFGStructurizer::addDummyExitBlock( if (MI) MI->eraseFromParent(); MBB->addSuccessor(DummyExitBlk); - DEBUG( - dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() - << " successors\n"; - ); + LLVM_DEBUG(dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber() + << " successors\n";); } SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: "); } @@ -1710,9 +1656,7 @@ void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB, } void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { - DEBUG( - dbgs() << "Retiring BB" << MBB->getNumber() << "\n"; - ); + LLVM_DEBUG(dbgs() << "Retiring BB" << MBB->getNumber() << "\n";); BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB]; diff --git a/lib/Target/AMDGPU/AMDKernelCodeT.h b/lib/Target/AMDGPU/AMDKernelCodeT.h index 5d243e949fd3..289642aaa2d0 100644 --- a/lib/Target/AMDGPU/AMDKernelCodeT.h +++ b/lib/Target/AMDGPU/AMDKernelCodeT.h @@ -198,7 +198,7 @@ enum amd_code_property_mask_t { AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT }; -/// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL +/// The hsa_ext_control_directives_t specifies the values for the HSAIL /// control directives. These control how the finalizer generates code. This /// struct is used both as an argument to hsaFinalizeKernel to specify values for /// the control directives, and is used in HsaKernelCode to record the values of @@ -551,14 +551,8 @@ typedef struct amd_kernel_code_s { int64_t kernel_code_prefetch_byte_offset; uint64_t kernel_code_prefetch_byte_size; - /// Number of bytes of scratch backing memory required for full - /// occupancy of target chip. This takes into account the number of - /// bytes of scratch per work-item, the wavefront size, the maximum - /// number of wavefronts per CU, and the number of CUs. This is an - /// upper limit on scratch. If the grid being dispatched is small it - /// may only need less than this. If the kernel uses no scratch, or - /// the Finalizer has not computed this value, it must be 0. - uint64_t max_scratch_backing_memory_byte_size; + /// Reserved. Must be 0. + uint64_t reserved0; /// Shader program settings for CS. Contains COMPUTE_PGM_RSRC1 and /// COMPUTE_PGM_RSRC2 registers. 
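The hunks above migrate the structurizer's debug printing from the old DEBUG macro to LLVM_DEBUG. A minimal sketch of the conventional setup the renamed macro relies on; the DEBUG_TYPE string and the helper function here are illustrative only, not taken from this change:

#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

// LLVM_DEBUG expands against whatever DEBUG_TYPE is defined in the
// translation unit at the point of use.
#define DEBUG_TYPE "structurizer-example"

static void traceMerge(int DstNum, int SrcNum) {
  // Compiled out entirely in NDEBUG builds; in +Asserts builds the message is
  // printed only when -debug or -debug-only=structurizer-example is given.
  LLVM_DEBUG(llvm::dbgs() << "serialPattern BB" << DstNum << " <= BB"
                          << SrcNum << "\n");
}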
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index ebf656c549ec..31e2885c833d 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -12,6 +12,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "SIDefines.h" +#include "SIInstrInfo.h" #include "Utils/AMDGPUAsmUtils.h" #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDKernelCodeTUtils.h" @@ -25,7 +26,6 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/ELF.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -42,9 +42,11 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/AMDGPUMetadata.h" +#include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/TargetRegistry.h" @@ -60,6 +62,7 @@ using namespace llvm; using namespace llvm::AMDGPU; +using namespace llvm::amdhsa; namespace { @@ -128,6 +131,7 @@ public: enum ImmTy { ImmTyNone, ImmTyGDS, + ImmTyLDS, ImmTyOffen, ImmTyIdxen, ImmTyAddr64, @@ -138,6 +142,7 @@ public: ImmTyGLC, ImmTySLC, ImmTyTFE, + ImmTyD16, ImmTyClampSI, ImmTyOModSI, ImmTyDppCtrl, @@ -267,7 +272,11 @@ public: return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID); } - bool isSDWARegKind() const; + bool isSDWAOperand(MVT type) const; + bool isSDWAFP16Operand() const; + bool isSDWAFP32Operand() const; + bool isSDWAInt16Operand() const; + bool isSDWAInt32Operand() const; bool isImmTy(ImmTy ImmT) const { return isImm() && Imm.Type == ImmT; @@ -282,7 +291,7 @@ public: bool isDMask() const { return isImmTy(ImmTyDMask); } bool isUNorm() const { return isImmTy(ImmTyUNorm); } bool isDA() const { return isImmTy(ImmTyDA); } - bool isR128() const { return isImmTy(ImmTyUNorm); } + bool isR128() const { return isImmTy(ImmTyR128); } bool isLWE() const { return isImmTy(ImmTyLWE); } bool isOff() const { return isImmTy(ImmTyOff); } bool isExpTgt() const { return isImmTy(ImmTyExpTgt); } @@ -298,9 +307,11 @@ public: bool isOffsetU12() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isUInt<12>(getImm()); } bool isOffsetS13() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isInt<13>(getImm()); } bool isGDS() const { return isImmTy(ImmTyGDS); } + bool isLDS() const { return isImmTy(ImmTyLDS); } bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } bool isTFE() const { return isImmTy(ImmTyTFE); } + bool isD16() const { return isImmTy(ImmTyD16); } bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); } bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); } bool isBankMask() const { return isImmTy(ImmTyDppBankMask); } @@ -434,7 +445,7 @@ public: } bool isVSrcB32() const { - return isVCSrcF32() || isLiteralImm(MVT::i32); + return isVCSrcF32() || isLiteralImm(MVT::i32) || isExpr(); } bool isVSrcB64() const { @@ -451,7 +462,7 @@ public: } bool isVSrcF32() const { - return isVCSrcF32() || isLiteralImm(MVT::f32); + return isVCSrcF32() || isLiteralImm(MVT::f32) || isExpr(); } bool isVSrcF64() const { @@ -643,6 +654,7 @@ public: switch (Type) { case 
ImmTyNone: OS << "None"; break; case ImmTyGDS: OS << "GDS"; break; + case ImmTyLDS: OS << "LDS"; break; case ImmTyOffen: OS << "Offen"; break; case ImmTyIdxen: OS << "Idxen"; break; case ImmTyAddr64: OS << "Addr64"; break; @@ -653,6 +665,7 @@ public: case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; case ImmTyTFE: OS << "TFE"; break; + case ImmTyD16: OS << "D16"; break; case ImmTyDFMT: OS << "DFMT"; break; case ImmTyNFMT: OS << "NFMT"; break; case ImmTyClampSI: OS << "ClampSI"; break; @@ -815,6 +828,10 @@ public: class AMDGPUAsmParser : public MCTargetAsmParser { MCAsmParser &Parser; + // Number of extra operands parsed after the first optional operand. + // This may be necessary to skip hardcoded mandatory operands. + static const unsigned MAX_OPR_LOOKAHEAD = 8; + unsigned ForcedEncodingSize = 0; bool ForcedDPP = false; bool ForcedSDWA = false; @@ -830,6 +847,27 @@ class AMDGPUAsmParser : public MCTargetAsmParser { private: bool ParseAsAbsoluteExpression(uint32_t &Ret); + bool OutOfRangeError(SMRange Range); + /// Calculate VGPR/SGPR blocks required for given target, reserved + /// registers, and user-specified NextFreeXGPR values. + /// + /// \param Features [in] Target features, used for bug corrections. + /// \param VCCUsed [in] Whether VCC special SGPR is reserved. + /// \param FlatScrUsed [in] Whether FLAT_SCRATCH special SGPR is reserved. + /// \param XNACKUsed [in] Whether XNACK_MASK special SGPR is reserved. + /// \param NextFreeVGPR [in] Max VGPR number referenced, plus one. + /// \param VGPRRange [in] Token range, used for VGPR diagnostics. + /// \param NextFreeSGPR [in] Max SGPR number referenced, plus one. + /// \param SGPRRange [in] Token range, used for SGPR diagnostics. + /// \param VGPRBlocks [out] Result VGPR block count. + /// \param SGPRBlocks [out] Result SGPR block count. 
+ bool calculateGPRBlocks(const FeatureBitset &Features, bool VCCUsed, + bool FlatScrUsed, bool XNACKUsed, + unsigned NextFreeVGPR, SMRange VGPRRange, + unsigned NextFreeSGPR, SMRange SGPRRange, + unsigned &VGPRBlocks, unsigned &SGPRBlocks); + bool ParseDirectiveAMDGCNTarget(); + bool ParseDirectiveAMDHSAKernel(); bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor); bool ParseDirectiveHSACodeObjectVersion(); bool ParseDirectiveHSACodeObjectISA(); @@ -848,8 +886,12 @@ private: bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex); + Optional<StringRef> getGprCountSymbolName(RegisterKind RegKind); + void initializeGprCountSymbol(RegisterKind RegKind); + bool updateGprCountSymbols(RegisterKind RegKind, unsigned DwordRegIndex, + unsigned RegWidth); void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, - bool IsAtomic, bool IsAtomicReturn); + bool IsAtomic, bool IsAtomicReturn, bool IsLds = false); void cvtDSImpl(MCInst &Inst, const OperandVector &Operands, bool IsGdsHardcoded); @@ -881,15 +923,37 @@ public: AMDGPU::IsaInfo::IsaVersion ISA = AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); MCContext &Ctx = getContext(); - MCSymbol *Sym = - Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); - Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx)); - Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping")); - Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); + if (ISA.Major >= 6 && AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) { + MCSymbol *Sym = + Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number")); + Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); + } else { + MCSymbol *Sym = + Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); + Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx)); + Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor")); + Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx)); + Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping")); + Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx)); + } + if (ISA.Major >= 6 && AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) { + initializeGprCountSymbol(IS_VGPR); + initializeGprCountSymbol(IS_SGPR); + } else + KernelScope.initialize(getContext()); } - KernelScope.initialize(getContext()); + } + + bool hasXNACK() const { + return AMDGPU::hasXNACK(getSTI()); + } + + bool hasMIMG_R128() const { + return AMDGPU::hasMIMG_R128(getSTI()); + } + + bool hasPackedD16() const { + return AMDGPU::hasPackedD16(getSTI()); } bool isSI() const { @@ -1025,6 +1089,11 @@ private: bool validateConstantBusLimitations(const MCInst &Inst); bool validateEarlyClobberLimitations(const MCInst &Inst); bool validateIntClampSupported(const MCInst &Inst); + bool validateMIMGAtomicDMask(const MCInst &Inst); + bool validateMIMGGatherDMask(const MCInst &Inst); + bool validateMIMGDataSize(const MCInst &Inst); + bool validateMIMGR128(const MCInst &Inst); + bool validateMIMGD16(const MCInst &Inst); bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; @@ -1037,6 +1106,7 @@ private: public: OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); + OperandMatchResultTy 
parseOptionalOpr(OperandVector &Operands); OperandMatchResultTy parseExpTgt(OperandVector &Operands); OperandMatchResultTy parseSendMsgOp(OperandVector &Operands); @@ -1060,17 +1130,12 @@ public: void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } + void cvtMubufLds(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false, true); } void cvtMtbuf(MCInst &Inst, const OperandVector &Operands); AMDGPUOperand::Ptr defaultGLC() const; AMDGPUOperand::Ptr defaultSLC() const; - AMDGPUOperand::Ptr defaultTFE() const; - AMDGPUOperand::Ptr defaultDMask() const; - AMDGPUOperand::Ptr defaultUNorm() const; - AMDGPUOperand::Ptr defaultDA() const; - AMDGPUOperand::Ptr defaultR128() const; - AMDGPUOperand::Ptr defaultLWE() const; AMDGPUOperand::Ptr defaultSMRDOffset8() const; AMDGPUOperand::Ptr defaultSMRDOffset20() const; AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; @@ -1276,15 +1341,31 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const { return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg()); } -bool AMDGPUOperand::isSDWARegKind() const { +bool AMDGPUOperand::isSDWAOperand(MVT type) const { if (AsmParser->isVI()) return isVReg(); else if (AsmParser->isGFX9()) - return isRegKind(); + return isRegKind() || isInlinableImm(type); else return false; } +bool AMDGPUOperand::isSDWAFP16Operand() const { + return isSDWAOperand(MVT::f16); +} + +bool AMDGPUOperand::isSDWAFP32Operand() const { + return isSDWAOperand(MVT::f32); +} + +bool AMDGPUOperand::isSDWAInt16Operand() const { + return isSDWAOperand(MVT::i16); +} + +bool AMDGPUOperand::isSDWAInt32Operand() const { + return isSDWAOperand(MVT::i32); +} + uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const { assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers()); @@ -1516,12 +1597,15 @@ static unsigned getSpecialRegForName(StringRef RegName) { .Case("exec", AMDGPU::EXEC) .Case("vcc", AMDGPU::VCC) .Case("flat_scratch", AMDGPU::FLAT_SCR) + .Case("xnack_mask", AMDGPU::XNACK_MASK) .Case("m0", AMDGPU::M0) .Case("scc", AMDGPU::SCC) .Case("tba", AMDGPU::TBA) .Case("tma", AMDGPU::TMA) .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) + .Case("xnack_mask_lo", AMDGPU::XNACK_MASK_LO) + .Case("xnack_mask_hi", AMDGPU::XNACK_MASK_HI) .Case("vcc_lo", AMDGPU::VCC_LO) .Case("vcc_hi", AMDGPU::VCC_HI) .Case("exec_lo", AMDGPU::EXEC_LO) @@ -1559,6 +1643,11 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth, RegWidth = 2; return true; } + if (Reg == AMDGPU::XNACK_MASK_LO && Reg1 == AMDGPU::XNACK_MASK_HI) { + Reg = AMDGPU::XNACK_MASK; + RegWidth = 2; + return true; + } if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) { Reg = AMDGPU::VCC; RegWidth = 2; @@ -1717,6 +1806,54 @@ bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg, return true; } +Optional<StringRef> +AMDGPUAsmParser::getGprCountSymbolName(RegisterKind RegKind) { + switch (RegKind) { + case IS_VGPR: + return StringRef(".amdgcn.next_free_vgpr"); + case IS_SGPR: + return StringRef(".amdgcn.next_free_sgpr"); + default: + return None; + } +} + +void AMDGPUAsmParser::initializeGprCountSymbol(RegisterKind RegKind) { + auto SymbolName = 
getGprCountSymbolName(RegKind); + assert(SymbolName && "initializing invalid register kind"); + MCSymbol *Sym = getContext().getOrCreateSymbol(*SymbolName); + Sym->setVariableValue(MCConstantExpr::create(0, getContext())); +} + +bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind, + unsigned DwordRegIndex, + unsigned RegWidth) { + // Symbols are only defined for GCN targets + if (AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()).Major < 6) + return true; + + auto SymbolName = getGprCountSymbolName(RegKind); + if (!SymbolName) + return true; + MCSymbol *Sym = getContext().getOrCreateSymbol(*SymbolName); + + int64_t NewMax = DwordRegIndex + RegWidth - 1; + int64_t OldCount; + + if (!Sym->isVariable()) + return !Error(getParser().getTok().getLoc(), + ".amdgcn.next_free_{v,s}gpr symbols must be variable"); + if (!Sym->getVariableValue(false)->evaluateAsAbsolute(OldCount)) + return !Error( + getParser().getTok().getLoc(), + ".amdgcn.next_free_{v,s}gpr symbols must be absolute expressions"); + + if (OldCount <= NewMax) + Sym->setVariableValue(MCConstantExpr::create(NewMax + 1, getContext())); + + return true; +} + std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { const auto &Tok = Parser.getTok(); SMLoc StartLoc = Tok.getLoc(); @@ -1727,7 +1864,11 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) { return nullptr; } - KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth); + if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) { + if (!updateGprCountSymbols(RegKind, DwordRegIndex, RegWidth)) + return nullptr; + } else + KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth); return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false); } @@ -2234,6 +2375,111 @@ bool AMDGPUAsmParser::validateIntClampSupported(const MCInst &Inst) { return true; } +bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst) { + + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) + return true; + + int VDataIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); + int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask); + int TFEIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::tfe); + + assert(VDataIdx != -1); + assert(DMaskIdx != -1); + assert(TFEIdx != -1); + + unsigned VDataSize = AMDGPU::getRegOperandSize(getMRI(), Desc, VDataIdx); + unsigned TFESize = Inst.getOperand(TFEIdx).getImm()? 1 : 0; + unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf; + if (DMask == 0) + DMask = 1; + + unsigned DataSize = + (Desc.TSFlags & SIInstrFlags::Gather4) ? 
4 : countPopulation(DMask); + if (hasPackedD16()) { + int D16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::d16); + if (D16Idx >= 0 && Inst.getOperand(D16Idx).getImm()) + DataSize = (DataSize + 1) / 2; + } + + return (VDataSize / 4) == DataSize + TFESize; +} + +bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) { + + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) + return true; + if (!Desc.mayLoad() || !Desc.mayStore()) + return true; // Not atomic + + int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask); + unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf; + + // This is an incomplete check because image_atomic_cmpswap + // may only use 0x3 and 0xf while other atomic operations + // may use 0x1 and 0x3. However these limitations are + // verified when we check that dmask matches dst size. + return DMask == 0x1 || DMask == 0x3 || DMask == 0xf; +} + +bool AMDGPUAsmParser::validateMIMGGatherDMask(const MCInst &Inst) { + + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::Gather4) == 0) + return true; + + int DMaskIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dmask); + unsigned DMask = Inst.getOperand(DMaskIdx).getImm() & 0xf; + + // GATHER4 instructions use dmask in a different fashion compared to + // other MIMG instructions. The only useful DMASK values are + // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns + // (red,red,red,red) etc.) The ISA document doesn't mention + // this. + return DMask == 0x1 || DMask == 0x2 || DMask == 0x4 || DMask == 0x8; +} + +bool AMDGPUAsmParser::validateMIMGR128(const MCInst &Inst) { + + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) + return true; + + int Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128); + assert(Idx != -1); + + bool R128 = (Inst.getOperand(Idx).getImm() != 0); + + return !R128 || hasMIMG_R128(); +} + +bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) { + + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) + return true; + + int D16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::d16); + if (D16Idx >= 0 && Inst.getOperand(D16Idx).getImm()) { + if (isCI() || isSI()) + return false; + } + + return true; +} + bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, const SMLoc &IDLoc) { if (!validateConstantBusLimitations(Inst)) { @@ -2251,6 +2497,32 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, "integer clamping is not supported on this GPU"); return false; } + if (!validateMIMGR128(Inst)) { + Error(IDLoc, + "r128 modifier is not supported on this GPU"); + return false; + } + // For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate. 
+ if (!validateMIMGD16(Inst)) { + Error(IDLoc, + "d16 modifier is not supported on this GPU"); + return false; + } + if (!validateMIMGDataSize(Inst)) { + Error(IDLoc, + "image data size does not match dmask and tfe"); + return false; + } + if (!validateMIMGAtomicDMask(Inst)) { + Error(IDLoc, + "invalid atomic image dmask"); + return false; + } + if (!validateMIMGGatherDMask(Inst)) { + Error(IDLoc, + "invalid image_gather dmask: only one bit must be set"); + return false; + } return true; } @@ -2355,6 +2627,320 @@ bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major, return false; } +bool AMDGPUAsmParser::ParseDirectiveAMDGCNTarget() { + if (getSTI().getTargetTriple().getArch() != Triple::amdgcn) + return TokError("directive only supported for amdgcn architecture"); + + std::string Target; + + SMLoc TargetStart = getTok().getLoc(); + if (getParser().parseEscapedString(Target)) + return true; + SMRange TargetRange = SMRange(TargetStart, getTok().getLoc()); + + std::string ExpectedTarget; + raw_string_ostream ExpectedTargetOS(ExpectedTarget); + IsaInfo::streamIsaVersion(&getSTI(), ExpectedTargetOS); + + if (Target != ExpectedTargetOS.str()) + return getParser().Error(TargetRange.Start, "target must match options", + TargetRange); + + getTargetStreamer().EmitDirectiveAMDGCNTarget(Target); + return false; +} + +bool AMDGPUAsmParser::OutOfRangeError(SMRange Range) { + return getParser().Error(Range.Start, "value out of range", Range); +} + +bool AMDGPUAsmParser::calculateGPRBlocks( + const FeatureBitset &Features, bool VCCUsed, bool FlatScrUsed, + bool XNACKUsed, unsigned NextFreeVGPR, SMRange VGPRRange, + unsigned NextFreeSGPR, SMRange SGPRRange, unsigned &VGPRBlocks, + unsigned &SGPRBlocks) { + // TODO(scott.linder): These calculations are duplicated from + // AMDGPUAsmPrinter::getSIProgramInfo and could be unified. 
+ IsaInfo::IsaVersion Version = IsaInfo::getIsaVersion(Features); + + unsigned NumVGPRs = NextFreeVGPR; + unsigned NumSGPRs = NextFreeSGPR; + unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(Features); + + if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) && + NumSGPRs > MaxAddressableNumSGPRs) + return OutOfRangeError(SGPRRange); + + NumSGPRs += + IsaInfo::getNumExtraSGPRs(Features, VCCUsed, FlatScrUsed, XNACKUsed); + + if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) && + NumSGPRs > MaxAddressableNumSGPRs) + return OutOfRangeError(SGPRRange); + + if (Features.test(FeatureSGPRInitBug)) + NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; + + VGPRBlocks = IsaInfo::getNumVGPRBlocks(Features, NumVGPRs); + SGPRBlocks = IsaInfo::getNumSGPRBlocks(Features, NumSGPRs); + + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { + if (getSTI().getTargetTriple().getArch() != Triple::amdgcn) + return TokError("directive only supported for amdgcn architecture"); + + if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA) + return TokError("directive only supported for amdhsa OS"); + + StringRef KernelName; + if (getParser().parseIdentifier(KernelName)) + return true; + + kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(); + + StringSet<> Seen; + + IsaInfo::IsaVersion IVersion = + IsaInfo::getIsaVersion(getSTI().getFeatureBits()); + + SMRange VGPRRange; + uint64_t NextFreeVGPR = 0; + SMRange SGPRRange; + uint64_t NextFreeSGPR = 0; + unsigned UserSGPRCount = 0; + bool ReserveVCC = true; + bool ReserveFlatScr = true; + bool ReserveXNACK = hasXNACK(); + + while (true) { + while (getLexer().is(AsmToken::EndOfStatement)) + Lex(); + + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected .amdhsa_ directive or .end_amdhsa_kernel"); + + StringRef ID = getTok().getIdentifier(); + SMRange IDRange = getTok().getLocRange(); + Lex(); + + if (ID == ".end_amdhsa_kernel") + break; + + if (Seen.find(ID) != Seen.end()) + return TokError(".amdhsa_ directives cannot be repeated"); + Seen.insert(ID); + + SMLoc ValStart = getTok().getLoc(); + int64_t IVal; + if (getParser().parseAbsoluteExpression(IVal)) + return true; + SMLoc ValEnd = getTok().getLoc(); + SMRange ValRange = SMRange(ValStart, ValEnd); + + if (IVal < 0) + return OutOfRangeError(ValRange); + + uint64_t Val = IVal; + +#define PARSE_BITS_ENTRY(FIELD, ENTRY, VALUE, RANGE) \ + if (!isUInt<ENTRY##_WIDTH>(VALUE)) \ + return OutOfRangeError(RANGE); \ + AMDHSA_BITS_SET(FIELD, ENTRY, VALUE); + + if (ID == ".amdhsa_group_segment_fixed_size") { + if (!isUInt<sizeof(KD.group_segment_fixed_size) * CHAR_BIT>(Val)) + return OutOfRangeError(ValRange); + KD.group_segment_fixed_size = Val; + } else if (ID == ".amdhsa_private_segment_fixed_size") { + if (!isUInt<sizeof(KD.private_segment_fixed_size) * CHAR_BIT>(Val)) + return OutOfRangeError(ValRange); + KD.private_segment_fixed_size = Val; + } else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") { + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, + Val, ValRange); + UserSGPRCount++; + } else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") { + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val, + ValRange); + UserSGPRCount++; + } else if (ID == ".amdhsa_user_sgpr_queue_ptr") { + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val, + ValRange); + UserSGPRCount++; + } else if (ID == 
".amdhsa_user_sgpr_kernarg_segment_ptr") { + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR, + Val, ValRange); + UserSGPRCount++; + } else if (ID == ".amdhsa_user_sgpr_dispatch_id") { + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val, + ValRange); + UserSGPRCount++; + } else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") { + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val, + ValRange); + UserSGPRCount++; + } else if (ID == ".amdhsa_user_sgpr_private_segment_size") { + PARSE_BITS_ENTRY(KD.kernel_code_properties, + KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, + Val, ValRange); + UserSGPRCount++; + } else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") { + PARSE_BITS_ENTRY( + KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET, Val, + ValRange); + } else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Val, + ValRange); + } else if (ID == ".amdhsa_system_sgpr_workgroup_id_y") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y, Val, + ValRange); + } else if (ID == ".amdhsa_system_sgpr_workgroup_id_z") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z, Val, + ValRange); + } else if (ID == ".amdhsa_system_sgpr_workgroup_info") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO, Val, + ValRange); + } else if (ID == ".amdhsa_system_vgpr_workitem_id") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID, Val, + ValRange); + } else if (ID == ".amdhsa_next_free_vgpr") { + VGPRRange = ValRange; + NextFreeVGPR = Val; + } else if (ID == ".amdhsa_next_free_sgpr") { + SGPRRange = ValRange; + NextFreeSGPR = Val; + } else if (ID == ".amdhsa_reserve_vcc") { + if (!isUInt<1>(Val)) + return OutOfRangeError(ValRange); + ReserveVCC = Val; + } else if (ID == ".amdhsa_reserve_flat_scratch") { + if (IVersion.Major < 7) + return getParser().Error(IDRange.Start, "directive requires gfx7+", + IDRange); + if (!isUInt<1>(Val)) + return OutOfRangeError(ValRange); + ReserveFlatScr = Val; + } else if (ID == ".amdhsa_reserve_xnack_mask") { + if (IVersion.Major < 8) + return getParser().Error(IDRange.Start, "directive requires gfx8+", + IDRange); + if (!isUInt<1>(Val)) + return OutOfRangeError(ValRange); + ReserveXNACK = Val; + } else if (ID == ".amdhsa_float_round_mode_32") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32, Val, ValRange); + } else if (ID == ".amdhsa_float_round_mode_16_64") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64, Val, ValRange); + } else if (ID == ".amdhsa_float_denorm_mode_32") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32, Val, ValRange); + } else if (ID == ".amdhsa_float_denorm_mode_16_64") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, Val, + ValRange); + } else if (ID == ".amdhsa_dx10_clamp") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, Val, ValRange); + } else if (ID == ".amdhsa_ieee_mode") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, + Val, ValRange); + } else if (ID == ".amdhsa_fp16_overflow") { + if (IVersion.Major < 9) 
+ return getParser().Error(IDRange.Start, "directive requires gfx9+", + IDRange); + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val, + ValRange); + } else if (ID == ".amdhsa_exception_fp_ieee_invalid_op") { + PARSE_BITS_ENTRY( + KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION, Val, + ValRange); + } else if (ID == ".amdhsa_exception_fp_denorm_src") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE, + Val, ValRange); + } else if (ID == ".amdhsa_exception_fp_ieee_div_zero") { + PARSE_BITS_ENTRY( + KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO, Val, + ValRange); + } else if (ID == ".amdhsa_exception_fp_ieee_overflow") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW, + Val, ValRange); + } else if (ID == ".amdhsa_exception_fp_ieee_underflow") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW, + Val, ValRange); + } else if (ID == ".amdhsa_exception_fp_ieee_inexact") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT, + Val, ValRange); + } else if (ID == ".amdhsa_exception_int_div_zero") { + PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, + COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO, + Val, ValRange); + } else { + return getParser().Error(IDRange.Start, + "unknown .amdhsa_kernel directive", IDRange); + } + +#undef PARSE_BITS_ENTRY + } + + if (Seen.find(".amdhsa_next_free_vgpr") == Seen.end()) + return TokError(".amdhsa_next_free_vgpr directive is required"); + + if (Seen.find(".amdhsa_next_free_sgpr") == Seen.end()) + return TokError(".amdhsa_next_free_sgpr directive is required"); + + unsigned VGPRBlocks; + unsigned SGPRBlocks; + if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr, + ReserveXNACK, NextFreeVGPR, VGPRRange, NextFreeSGPR, + SGPRRange, VGPRBlocks, SGPRBlocks)) + return true; + + if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_WIDTH>( + VGPRBlocks)) + return OutOfRangeError(VGPRRange); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT, VGPRBlocks); + + if (!isUInt<COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_WIDTH>( + SGPRBlocks)) + return OutOfRangeError(SGPRRange); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT, + SGPRBlocks); + + if (!isUInt<COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH>(UserSGPRCount)) + return TokError("too many user SGPRs enabled"); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, + UserSGPRCount); + + getTargetStreamer().EmitAmdhsaKernelDescriptor( + getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC, + ReserveFlatScr, ReserveXNACK); + return false; +} + bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() { uint32_t Major; uint32_t Minor; @@ -2421,6 +3007,13 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header) { + // max_scratch_backing_memory_byte_size is deprecated. Ignore it while parsing + // assembly for backwards compatibility. 
+ if (ID == "max_scratch_backing_memory_byte_size") { + Parser.eatToEndOfStatement(); + return false; + } + SmallString<40> ErrStr; raw_svector_ostream Err(ErrStr); if (!parseAmdKernelCodeField(ID, getParser(), Header, Err)) { @@ -2467,7 +3060,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() { getTargetStreamer().EmitAMDGPUSymbolType(KernelName, ELF::STT_AMDGPU_HSA_KERNEL); Lex(); - KernelScope.initialize(getContext()); + if (!AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) + KernelScope.initialize(getContext()); return false; } @@ -2571,20 +3165,28 @@ bool AMDGPUAsmParser::ParseDirectivePALMetadata() { bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); - if (IDVal == ".hsa_code_object_version") - return ParseDirectiveHSACodeObjectVersion(); + if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) { + if (IDVal == ".amdgcn_target") + return ParseDirectiveAMDGCNTarget(); + + if (IDVal == ".amdhsa_kernel") + return ParseDirectiveAMDHSAKernel(); + } else { + if (IDVal == ".hsa_code_object_version") + return ParseDirectiveHSACodeObjectVersion(); - if (IDVal == ".hsa_code_object_isa") - return ParseDirectiveHSACodeObjectISA(); + if (IDVal == ".hsa_code_object_isa") + return ParseDirectiveHSACodeObjectISA(); - if (IDVal == ".amd_kernel_code_t") - return ParseDirectiveAMDKernelCodeT(); + if (IDVal == ".amd_kernel_code_t") + return ParseDirectiveAMDKernelCodeT(); - if (IDVal == ".amdgpu_hsa_kernel") - return ParseDirectiveAMDGPUHsaKernel(); + if (IDVal == ".amdgpu_hsa_kernel") + return ParseDirectiveAMDGPUHsaKernel(); - if (IDVal == ".amd_amdgpu_isa") - return ParseDirectiveISAVersion(); + if (IDVal == ".amd_amdgpu_isa") + return ParseDirectiveISAVersion(); + } if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin) return ParseDirectiveHSAMetadata(); @@ -2612,6 +3214,10 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, case AMDGPU::TMA_LO: case AMDGPU::TMA_HI: return !isGFX9(); + case AMDGPU::XNACK_MASK: + case AMDGPU::XNACK_MASK_LO: + case AMDGPU::XNACK_MASK_HI: + return !isCI() && !isSI() && hasXNACK(); default: break; } @@ -3158,7 +3764,10 @@ bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, HwReg.IsSymbolic = true; HwReg.Id = ID_UNKNOWN_; const StringRef tok = Parser.getTok().getString(); - for (int i = ID_SYMBOLIC_FIRST_; i < ID_SYMBOLIC_LAST_; ++i) { + int Last = ID_SYMBOLIC_LAST_; + if (isSI() || isCI() || isVI()) + Last = ID_SYMBOLIC_FIRST_GFX9_; + for (int i = ID_SYMBOLIC_FIRST_; i < Last; ++i) { if (tok == IdSymbolic[i]) { HwReg.Id = i; break; @@ -3859,7 +4468,7 @@ AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) { } else { // Swizzle "offset" operand is optional. // If it is omitted, try parsing other optional operands. - return parseOptionalOperand(Operands); + return parseOptionalOpr(Operands); } } @@ -3907,13 +4516,13 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTySLC); } -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultTFE() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyTFE); -} - void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, - bool IsAtomic, bool IsAtomicReturn) { + bool IsAtomic, + bool IsAtomicReturn, + bool IsLds) { + bool IsLdsOpcode = IsLds; + bool HasLdsModifier = false; OptionalImmIndexMap OptionalIdx; assert(IsAtomicReturn ? 
IsAtomic : true); @@ -3932,6 +4541,8 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, continue; } + HasLdsModifier = Op.isLDS(); + // Handle tokens like 'offen' which are sometimes hard-coded into the // asm string. There are no MCInst operands for these. if (Op.isToken()) { @@ -3943,6 +4554,21 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, OptionalIdx[Op.getImmTy()] = i; } + // This is a workaround for an llvm quirk which may result in an + // incorrect instruction selection. Lds and non-lds versions of + // MUBUF instructions are identical except that lds versions + // have mandatory 'lds' modifier. However this modifier follows + // optional modifiers and llvm asm matcher regards this 'lds' + // modifier as an optional one. As a result, an lds version + // of opcode may be selected even if it has no 'lds' modifier. + if (IsLdsOpcode && !HasLdsModifier) { + int NoLdsOpcode = AMDGPU::getMUBUFNoLdsInst(Inst.getOpcode()); + if (NoLdsOpcode != -1) { // Got lds version - correct it. + Inst.setOpcode(NoLdsOpcode); + IsLdsOpcode = false; + } + } + // Copy $vdata_in operand and insert as $vdata for MUBUF_Atomic RTN insns. if (IsAtomicReturn) { MCInst::iterator I = Inst.begin(); // $vdata_in is always at the beginning. @@ -3954,7 +4580,10 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); + + if (!IsLdsOpcode) { // tfe is not legal with lds opcodes + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); + } } void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { @@ -4009,7 +4638,8 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands, if (IsAtomic) { // Add src, same as dst - ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1); + assert(Desc.getNumDefs() == 1); + ((AMDGPUOperand &)*Operands[I - 1]).addRegOperands(Inst, 1); } OptionalImmIndexMap OptionalIdx; @@ -4018,9 +4648,8 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands, AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments - if (Op.isRegOrImm()) { - Op.addRegOrImmOperands(Inst, 1); - continue; + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); } else if (Op.isImmModifier()) { OptionalIdx[Op.getImmTy()] = I; } else { @@ -4031,37 +4660,18 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyD16); } void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMIMG(Inst, Operands, true); } 
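The cvtMubufImpl comment above explains why the matcher may pick an lds MUBUF opcode even though no 'lds' modifier was parsed, and how the converter then falls back to the non-lds twin. A self-contained sketch of that correction step, using a hypothetical two-entry opcode table rather than the real AMDGPU::getMUBUFNoLdsInst mapping:

#include <map>

// Hypothetical opcodes standing in for an lds/non-lds MUBUF pair.
enum HypotheticalOpc { BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_LDS_OFFSET };

// Returns the non-lds twin of an lds opcode, or -1 if there is none.
static int noLdsTwin(unsigned Opc) {
  static const std::map<unsigned, unsigned> Twin = {
      {BUFFER_LOAD_DWORD_LDS_OFFSET, BUFFER_LOAD_DWORD_OFFSET}};
  auto It = Twin.find(Opc);
  return It == Twin.end() ? -1 : static_cast<int>(It->second);
}

// Mirrors the workaround: if an lds opcode was matched but the 'lds' modifier
// never appeared among the parsed operands, switch to the non-lds opcode.
static unsigned correctMubufOpcode(unsigned MatchedOpc, bool SawLdsModifier,
                                   bool IsLdsOpcode) {
  if (IsLdsOpcode && !SawLdsModifier) {
    int NoLds = noLdsTwin(MatchedOpc);
    if (NoLds != -1)
      return static_cast<unsigned>(NoLds);
  }
  return MatchedOpc;
}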
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDMask); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultUNorm() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyUNorm); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDA() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDA); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultR128() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyR128); -} - -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultLWE() const { - return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyLWE); -} - //===----------------------------------------------------------------------===// // smrd //===----------------------------------------------------------------------===// @@ -4148,6 +4758,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"offset0", AMDGPUOperand::ImmTyOffset0, false, nullptr}, {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr}, {"gds", AMDGPUOperand::ImmTyGDS, true, nullptr}, + {"lds", AMDGPUOperand::ImmTyLDS, true, nullptr}, {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, {"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr}, {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr}, @@ -4155,6 +4766,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, + {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"high", AMDGPUOperand::ImmTyHigh, true, nullptr}, {"clamp", AMDGPUOperand::ImmTyClampSI, true, nullptr}, {"omod", AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul}, @@ -4162,6 +4774,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"da", AMDGPUOperand::ImmTyDA, true, nullptr}, {"r128", AMDGPUOperand::ImmTyR128, true, nullptr}, {"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr}, + {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr}, {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr}, {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr}, @@ -4179,6 +4792,39 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { }; OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { + unsigned size = Operands.size(); + assert(size > 0); + + OperandMatchResultTy res = parseOptionalOpr(Operands); + + // This is a hack to enable hardcoded mandatory operands which follow + // optional operands. + // + // Current design assumes that all operands after the first optional operand + // are also optional. However implementation of some instructions violates + // this rule (see e.g. flat/global atomic which have hardcoded 'glc' operands). + // + // To alleviate this problem, we have to (implicitly) parse extra operands + // to make sure autogenerated parser of custom operands never hit hardcoded + // mandatory operands. + + if (size == 1 || ((AMDGPUOperand &)*Operands[size - 1]).isRegKind()) { + + // We have parsed the first optional operand. + // Parse as many operands as necessary to skip all mandatory operands. 
+ + for (unsigned i = 0; i < MAX_OPR_LOOKAHEAD; ++i) { + if (res != MatchOperand_Success || + getLexer().is(AsmToken::EndOfStatement)) break; + if (getLexer().is(AsmToken::Comma)) Parser.Lex(); + res = parseOptionalOpr(Operands); + } + } + + return res; +} + +OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands) { OperandMatchResultTy res; for (const OptionalOperand &Op : AMDGPUOptionalOperandTable) { // try to parse any optional operand here @@ -4341,12 +4987,14 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); } - // special case v_mac_{f16, f32}: + // Special case v_mac_{f16, f32} and v_fmac_f32 (gfx906): // it has src2 register operand that is tied to dst operand // we don't allow modifiers for this operand in assembler so src2_modifiers - // should be 0 - if (Opc == AMDGPU::V_MAC_F32_e64_si || Opc == AMDGPU::V_MAC_F32_e64_vi || - Opc == AMDGPU::V_MAC_F16_e64_vi) { + // should be 0. + if (Opc == AMDGPU::V_MAC_F32_e64_si || + Opc == AMDGPU::V_MAC_F32_e64_vi || + Opc == AMDGPU::V_MAC_F16_e64_vi || + Opc == AMDGPU::V_FMAC_F32_e64_vi) { auto it = Inst.begin(); std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers)); it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2 @@ -4448,21 +5096,23 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, //===----------------------------------------------------------------------===// bool AMDGPUOperand::isDPPCtrl() const { + using namespace AMDGPU::DPP; + bool result = isImm() && getImmTy() == ImmTyDppCtrl && isUInt<9>(getImm()); if (result) { int64_t Imm = getImm(); - return ((Imm >= 0x000) && (Imm <= 0x0ff)) || - ((Imm >= 0x101) && (Imm <= 0x10f)) || - ((Imm >= 0x111) && (Imm <= 0x11f)) || - ((Imm >= 0x121) && (Imm <= 0x12f)) || - (Imm == 0x130) || - (Imm == 0x134) || - (Imm == 0x138) || - (Imm == 0x13c) || - (Imm == 0x140) || - (Imm == 0x141) || - (Imm == 0x142) || - (Imm == 0x143); + return (Imm >= DppCtrl::QUAD_PERM_FIRST && Imm <= DppCtrl::QUAD_PERM_LAST) || + (Imm >= DppCtrl::ROW_SHL_FIRST && Imm <= DppCtrl::ROW_SHL_LAST) || + (Imm >= DppCtrl::ROW_SHR_FIRST && Imm <= DppCtrl::ROW_SHR_LAST) || + (Imm >= DppCtrl::ROW_ROR_FIRST && Imm <= DppCtrl::ROW_ROR_LAST) || + (Imm == DppCtrl::WAVE_SHL1) || + (Imm == DppCtrl::WAVE_ROL1) || + (Imm == DppCtrl::WAVE_SHR1) || + (Imm == DppCtrl::WAVE_ROR1) || + (Imm == DppCtrl::ROW_MIRROR) || + (Imm == DppCtrl::ROW_HALF_MIRROR) || + (Imm == DppCtrl::BCAST15) || + (Imm == DppCtrl::BCAST31); } return false; } @@ -4481,6 +5131,8 @@ bool AMDGPUOperand::isU16Imm() const { OperandMatchResultTy AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { + using namespace AMDGPU::DPP; + SMLoc S = Parser.getTok().getLoc(); StringRef Prefix; int64_t Int; @@ -4492,10 +5144,10 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { } if (Prefix == "row_mirror") { - Int = 0x140; + Int = DppCtrl::ROW_MIRROR; Parser.Lex(); } else if (Prefix == "row_half_mirror") { - Int = 0x141; + Int = DppCtrl::ROW_HALF_MIRROR; Parser.Lex(); } else { // Check to prevent parseDPPCtrlOps from eating invalid tokens @@ -4547,24 +5199,24 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { return MatchOperand_ParseFail; if (Prefix == "row_shl" && 1 <= Int && Int <= 15) { - Int |= 0x100; + Int |= DppCtrl::ROW_SHL0; } else if (Prefix == "row_shr" && 1 <= Int && Int <= 15) { - Int |= 0x110; + Int |= DppCtrl::ROW_SHR0; } else if (Prefix == "row_ror" && 1 <= Int && Int <= 15) { 
- Int |= 0x120; + Int |= DppCtrl::ROW_ROR0; } else if (Prefix == "wave_shl" && 1 == Int) { - Int = 0x130; + Int = DppCtrl::WAVE_SHL1; } else if (Prefix == "wave_rol" && 1 == Int) { - Int = 0x134; + Int = DppCtrl::WAVE_ROL1; } else if (Prefix == "wave_shr" && 1 == Int) { - Int = 0x138; + Int = DppCtrl::WAVE_SHR1; } else if (Prefix == "wave_ror" && 1 == Int) { - Int = 0x13C; + Int = DppCtrl::WAVE_ROR1; } else if (Prefix == "row_bcast") { if (Int == 15) { - Int = 0x142; + Int = DppCtrl::BCAST15; } else if (Int == 31) { - Int = 0x143; + Int = DppCtrl::BCAST31; } else { return MatchOperand_ParseFail; } @@ -4742,7 +5394,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } } if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { - Op.addRegWithInputModsOperands(Inst, 2); + Op.addRegOrImmWithInputModsOperands(Inst, 2); } else if (Op.isImm()) { // Handle optional arguments OptionalIdx[Op.getImmTy()] = I; @@ -4824,6 +5476,8 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, return Operand.isAddr64() ? Match_Success : Match_InvalidOperand; case MCK_gds: return Operand.isGDS() ? Match_Success : Match_InvalidOperand; + case MCK_lds: + return Operand.isLDS() ? Match_Success : Match_InvalidOperand; case MCK_glc: return Operand.isGLC() ? Match_Success : Match_InvalidOperand; case MCK_idxen: diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index 2230457b3a9b..b87c47a6b9ee 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -52,14 +52,19 @@ class getAddrName<int addrKind> { ""))))); } -class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { +class MUBUFAddr64Table <bit is_addr64, string Name> { bit IsAddr64 = is_addr64; - string OpName = NAME # suffix; + string OpName = Name; } -class MTBUFAddr64Table <bit is_addr64, string suffix = ""> { +class MUBUFLdsTable <bit is_lds, string Name> { + bit IsLds = is_lds; + string OpName = Name; +} + +class MTBUFAddr64Table <bit is_addr64, string Name> { bit IsAddr64 = is_addr64; - string OpName = NAME # suffix; + string OpName = Name; } //===----------------------------------------------------------------------===// @@ -137,17 +142,17 @@ class getMTBUFInsDA<list<RegisterClass> vdataList, RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe), + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe) + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe) ); dag InsData = !if(!empty(vaddrList), (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, - slc:$slc, tfe:$tfe), + SLC:$slc, TFE:$tfe), (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, - slc:$slc, tfe:$tfe) + SLC:$slc, TFE:$tfe) ); dag ret = !if(!empty(vdataList), InsNoData, InsData); } @@ -214,13 +219,13 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>, - MTBUFAddr64Table<0>; + MTBUFAddr64Table<0, NAME>; def _ADDR64 : MTBUF_Load_Pseudo <opName, 
BUFAddrKind.Addr64, vdataClass, [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>, - MTBUFAddr64Table<1>; + MTBUFAddr64Table<1, NAME>; def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; @@ -260,13 +265,13 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe))]>, - MTBUFAddr64Table<0>; + MTBUFAddr64Table<0, NAME>; def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe))]>, - MTBUFAddr64Table<1>; + MTBUFAddr64Table<1, NAME>; def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; @@ -310,6 +315,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins, bits<1> offen = 0; bits<1> idxen = 0; bits<1> addr64 = 0; + bits<1> lds = 0; bits<1> has_vdata = 1; bits<1> has_vaddr = 1; bits<1> has_glc = 1; @@ -336,7 +342,6 @@ class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> : bits<12> offset; bits<1> glc; - bits<1> lds = 0; bits<8> vaddr; bits<8> vdata; bits<7> srsrc; @@ -371,31 +376,35 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node> : } class getMUBUFInsDA<list<RegisterClass> vdataList, - list<RegisterClass> vaddrList=[]> { + list<RegisterClass> vaddrList=[], + bit isLds = 0> { RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe), + offset:$offset, GLC:$glc, SLC:$slc), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe) + offset:$offset, GLC:$glc, SLC:$slc) ); dag InsData = !if(!empty(vaddrList), (ins vdataClass:$vdata, SReg_128:$srsrc, - SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe), + SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc), (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, - SCSrc_b32:$soffset, offset:$offset, GLC:$glc, slc:$slc, tfe:$tfe) + SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc) ); - dag ret = !if(!empty(vdataList), InsNoData, InsData); + dag ret = !con( + !if(!empty(vdataList), InsNoData, InsData), + !if(isLds, (ins), (ins TFE:$tfe)) + ); } -class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[]> { +class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit isLds = 0> { dag ret = - !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList>.ret, - !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret, - !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32]>.ret, - !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64]>.ret, - !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64]>.ret, + !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), 
getMUBUFInsDA<vdataList, [VReg_64], isLds>.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64], isLds>.ret, (ins)))))); } @@ -426,20 +435,29 @@ class MUBUF_Load_Pseudo <string opName, int addrKind, RegisterClass vdataClass, bit HasTiedDest = 0, + bit isLds = 0, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind> : MUBUF_Pseudo<opName, (outs vdataClass:$vdata), - !con(getMUBUFIns<addrKindCopy>.ret, !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))), - " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", + !con(getMUBUFIns<addrKindCopy, [], isLds>.ret, + !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))), + " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" # + !if(isLds, " lds", "$tfe"), pattern>, MUBUF_SetupAddr<addrKindCopy> { - let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; + let PseudoInstr = opName # !if(isLds, "_lds", "") # + "_" # getAddrName<addrKindCopy>.ret; + let AsmMatchConverter = !if(isLds, "cvtMubufLds", "cvtMubuf"); + let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", ""); let mayLoad = 1; let mayStore = 0; let maybeAtomic = 1; + let Uses = !if(isLds, [EXEC, M0], [EXEC]); + let has_tfe = !if(isLds, 0, 1); + let lds = isLds; } // FIXME: tfe can't be an operand because it requires a separate @@ -447,32 +465,45 @@ class MUBUF_Load_Pseudo <string opName, multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, ValueType load_vt = i32, SDPatternOperator ld = null_frag, - bit TiedDest = 0> { + bit TiedDest = 0, + bit isLds = 0> { def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, - TiedDest, - [(set load_vt:$vdata, - (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>, - MUBUFAddr64Table<0>; + TiedDest, isLds, + !if(isLds, + [], + [(set load_vt:$vdata, + (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>, + MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>; def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, - TiedDest, - [(set load_vt:$vdata, - (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>, - MUBUFAddr64Table<1>; + TiedDest, isLds, + !if(isLds, + [], + [(set load_vt:$vdata, + (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))])>, + MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>; - def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>; - def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>; - def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>; + def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>; + def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>; + def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>; let DisableWQM = 1 in { - def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest>; - def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>; - def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>; - def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>; + def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>; + def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, 
vdataClass, TiedDest, isLds>; + def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>; + def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>; } } +multiclass MUBUF_Pseudo_Loads_Lds<string opName, RegisterClass vdataClass, + ValueType load_vt = i32, + SDPatternOperator ld_nolds = null_frag, + SDPatternOperator ld_lds = null_frag> { + defm NAME : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_nolds>; + defm _LDS : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_lds, 0, 1>; +} + class MUBUF_Store_Pseudo <string opName, int addrKind, RegisterClass vdataClass, @@ -499,12 +530,12 @@ multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>, - MUBUFAddr64Table<0>; + MUBUFAddr64Table<0, NAME>; def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>, - MUBUFAddr64Table<1>; + MUBUFAddr64Table<1, NAME>; def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; @@ -518,6 +549,23 @@ multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, } } +class MUBUF_Pseudo_Store_Lds<string opName> + : MUBUF_Pseudo<opName, + (outs), + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc), + " $srsrc, $soffset$offset lds$glc$slc"> { + let mayLoad = 0; + let mayStore = 1; + let maybeAtomic = 1; + + let has_vdata = 0; + let has_vaddr = 0; + let has_tfe = 0; + let lds = 1; + + let Uses = [EXEC, M0]; + let AsmMatchConverter = "cvtMubufLds"; +} class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, list<RegisterClass> vaddrList=[]> { @@ -525,15 +573,15 @@ class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, dag ret = !if(vdata_in, !if(!empty(vaddrList), (ins vdataClass:$vdata_in, - SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc), + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc), (ins vdataClass:$vdata_in, vaddrClass:$vaddr, - SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc) + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc) ), !if(!empty(vaddrList), (ins vdataClass:$vdata, - SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc), + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc), (ins vdataClass:$vdata, vaddrClass:$vaddr, - SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, slc:$slc) + SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, SLC:$slc) )); } @@ -618,9 +666,9 @@ multiclass MUBUF_Pseudo_Atomics <string opName, SDPatternOperator atomic> { def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>, - MUBUFAddr64Table <0>; + MUBUFAddr64Table <0, NAME>; def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>, - MUBUFAddr64Table <1>; + MUBUFAddr64Table <1, NAME>; def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; @@ -629,13 +677,13 @@ multiclass MUBUF_Pseudo_Atomics <string opName, [(set vdataType:$vdata, (atomic (MUBUFOffsetAtomic 
v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc), vdataType:$vdata_in))]>, - MUBUFAddr64Table <0, "_RTN">; + MUBUFAddr64Table <0, NAME # "_RTN">; def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, [(set vdataType:$vdata, (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vdataType:$vdata_in))]>, - MUBUFAddr64Table <1, "_RTN">; + MUBUFAddr64Table <1, NAME # "_RTN">; def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; @@ -647,7 +695,7 @@ multiclass MUBUF_Pseudo_Atomics <string opName, // MUBUF Instructions //===----------------------------------------------------------------------===// -defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads < +defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads_Lds < "buffer_load_format_x", VGPR_32 >; defm BUFFER_LOAD_FORMAT_XY : MUBUF_Pseudo_Loads < @@ -671,19 +719,74 @@ defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Pseudo_Stores < defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores < "buffer_store_format_xyzw", VReg_128 >; -defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads < + +let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { + defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_x", VGPR_32 + >; + defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xy", VReg_64 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyz", VReg_96 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyzw", VReg_128 + >; + defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_x", VGPR_32 + >; + defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xy", VReg_64 + >; + defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyz", VReg_96 + >; + defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyzw", VReg_128 + >; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { + defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_x", VGPR_32 + >; + defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xy", VGPR_32 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyz", VReg_64 + >; + defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_xyzw", VReg_64 + >; + defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_x", VGPR_32 + >; + defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xy", VGPR_32 + >; + defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyz", VReg_64 + >; + defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_xyzw", VReg_64 + >; +} // End HasPackedD16VMem. 
+ +defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds < "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8 >; -defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads < +defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads_Lds < "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8 >; -defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads < +defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads_Lds < "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16 >; -defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads < +defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads_Lds < "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16 >; -defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads < +defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds < "buffer_load_dword", VGPR_32, i32, mubuf_load >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads < @@ -695,6 +798,22 @@ defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads < defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads < "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load >; + +// This is not described in AMD documentation, +// but 'lds' versions of these opcodes are available +// in at least GFX8+ chips. See Bug 37653. +let SubtargetPredicate = isVI in { +defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads < + "buffer_load_dwordx2", VReg_64, v2i32, null_frag, 0, 1 +>; +defm BUFFER_LOAD_DWORDX3_LDS : MUBUF_Pseudo_Loads < + "buffer_load_dwordx3", VReg_96, untyped, null_frag, 0, 1 +>; +defm BUFFER_LOAD_DWORDX4_LDS : MUBUF_Pseudo_Loads < + "buffer_load_dwordx4", VReg_128, v4i32, null_frag, 0, 1 +>; +} + defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores < "buffer_store_byte", VGPR_32, i32, truncstorei8_global >; @@ -792,6 +911,10 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global >; +let SubtargetPredicate = isVI in { +def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">; +} + let SubtargetPredicate = isSI in { // isn't on CI & VI /* defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">; @@ -842,6 +965,13 @@ defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores < "buffer_store_short_d16_hi", VGPR_32, i32 >; +defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_hi_x", VGPR_32 +>; +defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < + "buffer_store_format_d16_hi_x", VGPR_32 +>; + } // End HasD16LoadStore def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", @@ -860,6 +990,28 @@ defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>; defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; +let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { + defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; + defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>; + defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; + defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>; + defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96>; + defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", 
VReg_128>; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { + defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; + defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>; + defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; + defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>; + defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>; + defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>; +} // End HasPackedD16VMem. + let SubtargetPredicate = isCIVI in { //===----------------------------------------------------------------------===// @@ -922,6 +1074,19 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">; + +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">; +} // End HasPackedD16VMem. + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">; @@ -969,6 +1134,19 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">; + +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">; + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">; + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">; + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">; + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">; +} // End HasPackedD16VMem. 
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">; @@ -1210,7 +1388,7 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>; -let OtherPredicates = [HasD16LoadStore] in { +let OtherPredicates = [D16PreservesUnusedBits] in { defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>; defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>; defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>; @@ -1325,7 +1503,7 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OF defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>; -let OtherPredicates = [HasD16LoadStore] in { +let OtherPredicates = [D16PreservesUnusedBits] in { // Hiding the extract high pattern in the PatFrag seems to not // automatically increase the complexity. let AddedComplexity = 1 in { @@ -1382,6 +1560,18 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">; defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">; + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">; + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">; + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">; + defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZW">; +} // End HasPackedD16VMem. + multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { def : GCNPat< @@ -1431,6 +1621,18 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY" defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">; + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">; + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">; +} // End HasUnpackedD16VMem. + +let SubtargetPredicate = HasPackedD16VMem in { + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">; + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">; + defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f16, "TBUFFER_STORE_FORMAT_D16_XYZW">; +} // End HasPackedD16VMem. 
+ //===----------------------------------------------------------------------===// // Target instructions, move to the appropriate target TD file //===----------------------------------------------------------------------===// @@ -1451,7 +1653,7 @@ class MUBUF_Real_si <bits<7> op, MUBUF_Pseudo ps> : let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); let Inst{15} = ps.addr64; - let Inst{16} = lds; + let Inst{16} = !if(ps.lds, 1, 0); let Inst{24-18} = op; let Inst{31-26} = 0x38; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); @@ -1470,6 +1672,31 @@ multiclass MUBUF_Real_AllAddr_si<bits<7> op> { def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; } +multiclass MUBUF_Real_AllAddr_Lds_si<bits<7> op> { + + def _OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, + MUBUFLdsTable<0, NAME # "_OFFSET_si">; + def _ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64")>, + MUBUFLdsTable<0, NAME # "_ADDR64_si">; + def _OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>, + MUBUFLdsTable<0, NAME # "_OFFEN_si">; + def _IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>, + MUBUFLdsTable<0, NAME # "_IDXEN_si">; + def _BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>, + MUBUFLdsTable<0, NAME # "_BOTHEN_si">; + + def _LDS_OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>, + MUBUFLdsTable<1, NAME # "_OFFSET_si">; + def _LDS_ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_ADDR64")>, + MUBUFLdsTable<1, NAME # "_ADDR64_si">; + def _LDS_OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>, + MUBUFLdsTable<1, NAME # "_OFFEN_si">; + def _LDS_IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>, + MUBUFLdsTable<1, NAME # "_IDXEN_si">; + def _LDS_BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>, + MUBUFLdsTable<1, NAME # "_BOTHEN_si">; +} + multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> { def _OFFSET_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>; def _ADDR64_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>; @@ -1478,7 +1705,7 @@ multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> { def _BOTHEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>; } -defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_si <0x00>; +defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_si <0x00>; defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_si <0x01>; defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x02>; defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x03>; @@ -1486,11 +1713,11 @@ defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_si <0x04>; defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_si <0x05>; defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_si <0x06>; defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_si <0x07>; -defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_si <0x08>; -defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_si <0x09>; -defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_si <0x0a>; -defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_si <0x0b>; -defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_si <0x0c>; +defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_si <0x08>; +defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_si <0x09>; +defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_si <0x0a>; +defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_si <0x0b>; +defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_si <0x0c>; defm 
BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_si <0x0d>; defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_si <0x0e>; defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_si <0x0f>; @@ -1575,7 +1802,7 @@ multiclass MTBUF_Real_AllAddr_si<bits<3> op> { defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>; defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>; -//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>; defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>; defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>; defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>; @@ -1610,7 +1837,7 @@ class MUBUF_Real_vi <bits<7> op, MUBUF_Pseudo ps> : let Inst{12} = ps.offen; let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); - let Inst{16} = lds; + let Inst{16} = !if(ps.lds, 1, 0); let Inst{17} = !if(ps.has_slc, slc, ?); let Inst{24-18} = op; let Inst{31-26} = 0x38; //encoding @@ -1628,6 +1855,56 @@ multiclass MUBUF_Real_AllAddr_vi<bits<7> op> { def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; } +multiclass MUBUF_Real_AllAddr_Lds_vi<bits<7> op> { + + def _OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>, + MUBUFLdsTable<0, NAME # "_OFFSET_vi">; + def _OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>, + MUBUFLdsTable<0, NAME # "_OFFEN_vi">; + def _IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>, + MUBUFLdsTable<0, NAME # "_IDXEN_vi">; + def _BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>, + MUBUFLdsTable<0, NAME # "_BOTHEN_vi">; + + def _LDS_OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFSET")>, + MUBUFLdsTable<1, NAME # "_OFFSET_vi">; + def _LDS_OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_OFFEN")>, + MUBUFLdsTable<1, NAME # "_OFFEN_vi">; + def _LDS_IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_IDXEN")>, + MUBUFLdsTable<1, NAME # "_IDXEN_vi">; + def _LDS_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>, + MUBUFLdsTable<1, NAME # "_BOTHEN_vi">; +} + +class MUBUF_Real_gfx80 <bits<7> op, MUBUF_Pseudo ps> : + MUBUF_Real<op, ps>, + Enc64, + SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX80> { + let AssemblerPredicate=HasUnpackedD16VMem; + let DecoderNamespace="GFX80_UNPACKED"; + + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{16} = !if(ps.lds, 1, 0); + let Inst{17} = !if(ps.has_slc, slc, ?); + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); +} + +multiclass MUBUF_Real_AllAddr_gfx80<bits<7> op> { + def _OFFSET_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>; + def _OFFEN_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_gfx80 : MUBUF_Real_gfx80 <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>; +} + multiclass MUBUF_Real_Atomic_vi<bits<7> op> : MUBUF_Real_AllAddr_vi<op> { def _OFFSET_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>; @@ -1636,7 +1913,7 @@ multiclass MUBUF_Real_Atomic_vi<bits<7> op> : def _BOTHEN_RTN_vi : MUBUF_Real_vi 
<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>; } -defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_vi <0x00>; +defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_Lds_vi <0x00>; defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x01>; defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x02>; defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x03>; @@ -1644,14 +1921,34 @@ defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_vi <0x04>; defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_vi <0x05>; defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_vi <0x06>; defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_vi <0x07>; -defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_vi <0x10>; -defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_vi <0x11>; -defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_vi <0x12>; -defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_vi <0x13>; -defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_vi <0x14>; -defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>; -defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>; -defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x08>; + defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x09>; + defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0a>; + defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0b>; + defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0c>; + defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0d>; + defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0e>; + defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Real_AllAddr_gfx80 <0x0f>; +} // End HasUnpackedD16VMem. +let SubtargetPredicate = HasPackedD16VMem in { + defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x08>; + defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x09>; + defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0a>; + defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0b>; + defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_vi <0x0c>; + defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_vi <0x0d>; + defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_vi <0x0e>; + defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_vi <0x0f>; +} // End HasPackedD16VMem. 
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_Lds_vi <0x10>; +defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_Lds_vi <0x11>; +defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_Lds_vi <0x12>; +defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_Lds_vi <0x13>; +defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_Lds_vi <0x14>; +defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_Lds_vi <0x15>; +defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_Lds_vi <0x16>; +defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_Lds_vi <0x17>; defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>; defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x19>; defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>; @@ -1668,6 +1965,9 @@ defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x23>; defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_vi <0x24>; defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_vi <0x25>; +defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_vi <0x26>; +defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_vi <0x27>; + defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_vi <0x40>; defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_vi <0x41>; defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_vi <0x42>; @@ -1696,6 +1996,8 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_vi <0x6a>; defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_vi <0x6b>; defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_vi <0x6c>; +def BUFFER_STORE_LDS_DWORD_vi : MUBUF_Real_vi <0x3d, BUFFER_STORE_LDS_DWORD>; + def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>; def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>; @@ -1729,11 +2031,61 @@ multiclass MTBUF_Real_AllAddr_vi<bits<4> op> { def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; } -defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>; -defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>; -//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <5>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>; +class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> : + MTBUF_Real<ps>, + Enc64, + SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX80> { + let AssemblerPredicate=HasUnpackedD16VMem; + let DecoderNamespace="GFX80_UNPACKED"; + + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{18-15} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); +} + +multiclass MTBUF_Real_AllAddr_gfx80<bits<4> op> { + def _OFFSET_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>; + def _OFFEN_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>; + def _IDXEN_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>; + def _BOTHEN_gfx80 : MTBUF_Real_gfx80 <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>; +} + +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0x00>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x01>; 
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x02>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x03>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <0x04>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <0x05>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <0x06>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <0x07>; +let SubtargetPredicate = HasUnpackedD16VMem in { + defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x08>; + defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x09>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0a>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0b>; + defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0c>; + defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0d>; + defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0e>; + defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Real_AllAddr_gfx80 <0x0f>; +} // End HasUnpackedD16VMem. +let SubtargetPredicate = HasPackedD16VMem in { + defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x08>; + defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x09>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0a>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0b>; + defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_vi <0x0c>; + defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_vi <0x0d>; + defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0e>; + defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0f>; +} // End HasUnpackedD16VMem. diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 3a8503030414..174b2df15300 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -1,18 +1,33 @@ set(LLVM_TARGET_DEFINITIONS AMDGPU.td) -tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info) -tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info) -tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel) -tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv) -tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) -tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic) -tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) -tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer) -tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher) +tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv) +tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel) tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler) +tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info) +tablegen(LLVM AMDGPUGenIntrinsicEnums.inc -gen-tgt-intrinsic-enums) +tablegen(LLVM AMDGPUGenIntrinsicImpl.inc -gen-tgt-intrinsic-impl) +tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering) tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank) +tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM AMDGPUGenSearchableTables.inc -gen-searchable-tables) +tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) + +set(LLVM_TARGET_DEFINITIONS AMDGPUGISel.td) +tablegen(LLVM AMDGPUGenGlobalISel.inc -gen-global-isel) + +set(LLVM_TARGET_DEFINITIONS R600.td) +tablegen(LLVM R600GenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM R600GenCallingConv.inc -gen-callingconv) 
+tablegen(LLVM R600GenDAGISel.inc -gen-dag-isel) +tablegen(LLVM R600GenDFAPacketizer.inc -gen-dfa-packetizer) +tablegen(LLVM R600GenInstrInfo.inc -gen-instr-info) +tablegen(LLVM R600GenMCCodeEmitter.inc -gen-emitter) +tablegen(LLVM R600GenRegisterInfo.inc -gen-register-info) +tablegen(LLVM R600GenSubtargetInfo.inc -gen-subtarget) + add_public_tablegen_target(AMDGPUCommonTableGen) add_llvm_target(AMDGPUCodeGen @@ -25,6 +40,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUCallLowering.cpp AMDGPUCodeGenPrepare.cpp AMDGPUFrameLowering.cpp + AMDGPUHSAMetadataStreamer.cpp AMDGPUInstrInfo.cpp AMDGPUInstructionSelector.cpp AMDGPUIntrinsicInfo.cpp @@ -34,13 +50,14 @@ add_llvm_target(AMDGPUCodeGen AMDGPULibCalls.cpp AMDGPULibFunc.cpp AMDGPULowerIntrinsics.cpp + AMDGPULowerKernelArguments.cpp + AMDGPULowerKernelAttributes.cpp AMDGPUMachineCFGStructurizer.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp AMDGPUMacroFusion.cpp AMDGPUMCInstLower.cpp AMDGPUOpenCLEnqueuedBlockLowering.cpp - AMDGPUOpenCLImageTypeLoweringPass.cpp AMDGPUPromoteAlloca.cpp AMDGPURegAsmNames.inc.cpp AMDGPURegisterBankInfo.cpp @@ -53,12 +70,14 @@ add_llvm_target(AMDGPUCodeGen AMDGPUUnifyDivergentExitNodes.cpp AMDGPUUnifyMetadata.cpp AMDGPUInline.cpp + AMDGPUPerfHintAnalysis.cpp AMDILCFGStructurizer.cpp GCNHazardRecognizer.cpp GCNIterativeScheduler.cpp GCNMinRegStrategy.cpp GCNRegPressure.cpp GCNSchedStrategy.cpp + R600AsmPrinter.cpp R600ClauseMergePass.cpp R600ControlFlowFinalizer.cpp R600EmitClauseMarkers.cpp @@ -68,6 +87,7 @@ add_llvm_target(AMDGPUCodeGen R600ISelLowering.cpp R600MachineFunctionInfo.cpp R600MachineScheduler.cpp + R600OpenCLImageTypeLoweringPass.cpp R600OptimizeVectorRegisters.cpp R600Packetizer.cpp R600RegisterInfo.cpp @@ -77,10 +97,10 @@ add_llvm_target(AMDGPUCodeGen SIFixVGPRCopies.cpp SIFixWWMLiveness.cpp SIFoldOperands.cpp + SIFormMemoryClauses.cpp SIFrameLowering.cpp SIInsertSkips.cpp SIInsertWaitcnts.cpp - SIInsertWaits.cpp SIInstrInfo.cpp SIISelLowering.cpp SILoadStoreOptimizer.cpp @@ -99,8 +119,8 @@ add_llvm_target(AMDGPUCodeGen ) add_subdirectory(AsmParser) -add_subdirectory(InstPrinter) add_subdirectory(Disassembler) -add_subdirectory(TargetInfo) +add_subdirectory(InstPrinter) add_subdirectory(MCTargetDesc) +add_subdirectory(TargetInfo) add_subdirectory(Utils) diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td index f898fd7948cc..cdc6ab9412e6 100644 --- a/lib/Target/AMDGPU/DSInstructions.td +++ b/lib/Target/AMDGPU/DSInstructions.td @@ -440,7 +440,7 @@ defm DS_XOR_RTN_B32 : DS_1A1D_RET_mc<"ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; -defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc <"ds_min_rtn_f32", VGPR_32, "ds_min_f32">; +defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc<"ds_min_rtn_f32", VGPR_32, "ds_min_f32">; defm DS_MAX_RTN_F32 : DS_1A1D_RET_mc<"ds_max_rtn_f32", VGPR_32, "ds_max_f32">; defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b32">; @@ -584,6 +584,8 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32", int_amdgcn_ds_bpermute>; } +def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; + } // let SubtargetPredicate = isVI //===----------------------------------------------------------------------===// @@ -600,8 +602,6 @@ class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < (inst $ptr, (as_i16imm $offset), 
(i1 0)) >; -// FIXME: Passing name of PatFrag in workaround. Why doesn't -// !cast<PatFrag>(frag.NAME#"_m0") work!? multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> { let OtherPredicates = [LDSRequiresM0Init] in { @@ -609,7 +609,7 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> { } let OtherPredicates = [NotLDSRequiresM0Init] in { - def : DSReadPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>; + def : DSReadPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>; } } @@ -647,14 +647,17 @@ defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">; defm : DSReadPat_mc <DS_READ_U16, i32, "az_extloadi16_local">; defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">; defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">; +defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">; +defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">; let AddedComplexity = 100 in { defm : DSReadPat_mc <DS_READ_B64, v2i32, "load_align8_local">; +defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">; } // End AddedComplexity = 100 -let OtherPredicates = [HasD16LoadStore] in { +let OtherPredicates = [D16PreservesUnusedBits] in { let AddedComplexity = 100 in { defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>; defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>; @@ -678,7 +681,24 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> { } let OtherPredicates = [NotLDSRequiresM0Init] in { - def : DSWritePat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>; + def : DSWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>; + } +} + +// Irritatingly, atomic_store reverses the order of operands from a +// normal store. 
+class DSAtomicWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < + (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), + (inst $ptr, $value, (as_i16imm $offset), (i1 0)) +>; + +multiclass DSAtomicWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> { + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSAtomicWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + def : DSAtomicWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>; } } @@ -687,8 +707,10 @@ defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">; defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">; defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">; defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">; +defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local">; +defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local">; -let OtherPredicates = [HasD16LoadStore] in { +let OtherPredicates = [D16PreservesUnusedBits] in { def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>; def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_local_hi16>; } @@ -720,6 +742,8 @@ def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>; let AddedComplexity = 100 in { defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">; +defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">; + } // End AddedComplexity = 100 class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), @@ -732,7 +756,8 @@ multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { } let OtherPredicates = [NotLDSRequiresM0Init] in { - def : DSAtomicRetPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>; + def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, + !cast<PatFrag>(frag)>; } } @@ -749,7 +774,8 @@ multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> { } let OtherPredicates = [NotLDSRequiresM0Init] in { - def : DSAtomicCmpXChg<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>; + def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, + !cast<PatFrag>(frag)>; } } @@ -769,6 +795,9 @@ defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max_local">; defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">; defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">; defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin_local">; +defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax_local">; +defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd_local">; // 64-bit atomics. 
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">; @@ -1123,6 +1152,7 @@ def DS_XOR_SRC2_B32_vi : DS_Real_vi<0x8b, DS_XOR_SRC2_B32>; def DS_WRITE_SRC2_B32_vi : DS_Real_vi<0x8d, DS_WRITE_SRC2_B32>; def DS_MIN_SRC2_F32_vi : DS_Real_vi<0x92, DS_MIN_SRC2_F32>; def DS_MAX_SRC2_F32_vi : DS_Real_vi<0x93, DS_MAX_SRC2_F32>; +def DS_ADD_SRC2_F32_vi : DS_Real_vi<0x95, DS_ADD_SRC2_F32>; def DS_ADD_SRC2_U64_vi : DS_Real_vi<0xc0, DS_ADD_SRC2_U64>; def DS_SUB_SRC2_U64_vi : DS_Real_vi<0xc1, DS_SUB_SRC2_U64>; def DS_RSUB_SRC2_U64_vi : DS_Real_vi<0xc2, DS_RSUB_SRC2_U64>; diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 47a2d3f2fdc5..f3de903f21b2 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -20,7 +20,9 @@ #include "Disassembler/AMDGPUDisassembler.h" #include "AMDGPU.h" #include "AMDGPURegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm-c/Disassembler.h" #include "llvm/ADT/APInt.h" @@ -198,6 +200,21 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address); if (Res) { IsSDWA = true; break; } + + if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) { + Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address); + if (Res) + break; + } + + // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and + // v_mad_mixhi_f16 for FMA variants. Try to decode using this special + // table first so we print the correct name. + if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) { + Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address); + if (Res) + break; + } } // Reinitialize Bytes as DPP64 could have eaten too much @@ -228,7 +245,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si || - MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) { + MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi || + MI.getOpcode() == AMDGPU::V_FMAC_F32_e64_vi)) { // Insert dummy unused src2_modifiers. insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::src2_modifiers); @@ -241,7 +259,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, if (Res && IsSDWA) Res = convertSDWAInst(MI); - Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0; + // if the opcode was not recognized we'll assume a Size of 4 bytes + // (unless there are fewer bytes left) + Size = Res ? (MaxInstBytesNum - Bytes.size()) + : std::min((size_t)4, Bytes_.size()); return Res; } @@ -264,26 +285,70 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { return MCDisassembler::Success; } +// Note that MIMG format provides no information about VADDR size. +// Consequently, decoded instructions always show address +// as if it has 1 dword, which could be not really so. 
DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { + + int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vdst); + int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dmask); + + int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::tfe); + int D16Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::d16); + + assert(VDataIdx != -1); + assert(DMaskIdx != -1); + assert(TFEIdx != -1); + + bool IsAtomic = (VDstIdx != -1); + bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4; + unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf; if (DMask == 0) return MCDisassembler::Success; - unsigned ChannelCount = countPopulation(DMask); - if (ChannelCount == 1) + unsigned DstSize = IsGather4 ? 4 : countPopulation(DMask); + if (DstSize == 1) return MCDisassembler::Success; - int NewOpcode = AMDGPU::getMaskedMIMGOp(*MCII, MI.getOpcode(), ChannelCount); - assert(NewOpcode != -1 && "could not find matching mimg channel instruction"); + bool D16 = D16Idx >= 0 && MI.getOperand(D16Idx).getImm(); + if (D16 && AMDGPU::hasPackedD16(STI)) { + DstSize = (DstSize + 1) / 2; + } + + // FIXME: Add tfe support + if (MI.getOperand(TFEIdx).getImm()) + return MCDisassembler::Success; + + int NewOpcode = -1; + + if (IsGather4) { + if (D16 && AMDGPU::hasPackedD16(STI)) + NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), 2); + else + return MCDisassembler::Success; + } else { + NewOpcode = AMDGPU::getMaskedMIMGOp(MI.getOpcode(), DstSize); + if (NewOpcode == -1) + return MCDisassembler::Success; + } + auto RCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass; - // Widen the register to the correct number of enabled channels. + // Get first subregister of VData unsigned Vdata0 = MI.getOperand(VDataIdx).getReg(); + unsigned VdataSub0 = MRI.getSubReg(Vdata0, AMDGPU::sub0); + Vdata0 = (VdataSub0 != 0)? VdataSub0 : Vdata0; + + // Widen the register to the correct number of enabled channels. auto NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0, &MRI.getRegClass(RCID)); if (NewVdata == AMDGPU::NoRegister) { @@ -297,6 +362,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { // how it is usually emitted because the number of register components is not // in the instruction encoding. 
MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata); + + if (IsAtomic) { + // Atomic operations have an additional operand (a copy of data) + MI.getOperand(VDstIdx) = MCOperand::createReg(NewVdata); + } + return MCDisassembler::Success; } @@ -690,9 +761,8 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { switch (Val) { case 102: return createRegOperand(FLAT_SCR_LO); case 103: return createRegOperand(FLAT_SCR_HI); - // ToDo: no support for xnack_mask_lo/_hi register - case 104: - case 105: break; + case 104: return createRegOperand(XNACK_MASK_LO); + case 105: return createRegOperand(XNACK_MASK_HI); case 106: return createRegOperand(VCC_LO); case 107: return createRegOperand(VCC_HI); case 108: assert(!isGFX9()); return createRegOperand(TBA_LO); @@ -722,6 +792,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { switch (Val) { case 102: return createRegOperand(FLAT_SCR); + case 104: return createRegOperand(XNACK_MASK); case 106: return createRegOperand(VCC); case 108: assert(!isGFX9()); return createRegOperand(TBA); case 110: assert(!isGFX9()); return createRegOperand(TMA); @@ -732,8 +803,9 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { } MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, - unsigned Val) const { + const unsigned Val) const { using namespace AMDGPU::SDWA; + using namespace AMDGPU::EncValues; if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) { // XXX: static_cast<int> is needed to avoid stupid warning: @@ -754,7 +826,15 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, Val - SDWA9EncValues::SRC_TTMP_MIN); } - return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); + const unsigned SVal = Val - SDWA9EncValues::SRC_SGPR_MIN; + + if (INLINE_INTEGER_C_MIN <= SVal && SVal <= INLINE_INTEGER_C_MAX) + return decodeIntImmed(SVal); + + if (INLINE_FLOATING_C_MIN <= SVal && SVal <= INLINE_FLOATING_C_MAX) + return decodeFPImmed(Width, SVal); + + return decodeSpecialReg32(SVal); } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) { return createRegOperand(getVgprClassId(Width), Val); } @@ -815,6 +895,9 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst, } auto *Symbols = static_cast<SectionSymbolsTy *>(DisInfo); + if (!Symbols) + return false; + auto Result = std::find_if(Symbols->begin(), Symbols->end(), [Value](const SymbolInfoTy& Val) { return std::get<0>(Val) == static_cast<uint64_t>(Value) diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td index 5e26f97b0c86..944f4ffe598d 100644 --- a/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/lib/Target/AMDGPU/EvergreenInstructions.td @@ -15,7 +15,6 @@ def isEG : Predicate< "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && " - "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && " "!Subtarget->hasCaymanISA()" >; @@ -693,7 +692,7 @@ def : EGOrCaymanPat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>; def : EGOrCaymanPat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; // SHA-256 Patterns -def : SHA256MaPattern <BFI_INT_eg, XOR_INT>; +defm : SHA256MaPattern <BFI_INT_eg, XOR_INT, R600_Reg64>; def EG_ExportSwz : ExportSwzInst { let Word1{19-16} = 0; // BURST_COUNT diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index 693869128081..3ef473b7fd96 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -135,7 +135,7 @@ class 
FLAT_Load_Pseudo <string opName, RegisterClass regClass, !con((ins VReg_64:$vaddr), !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)), - (ins GLC:$glc, slc:$slc)), + (ins GLC:$glc, SLC:$slc)), !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))), " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> { let has_data = 0; @@ -158,7 +158,7 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, !con((ins VReg_64:$vaddr, vdataClass:$vdata), !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)), - (ins GLC:$glc, slc:$slc)), + (ins GLC:$glc, SLC:$slc)), " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> { let mayLoad = 0; let mayStore = 1; @@ -188,8 +188,8 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass, opName, (outs regClass:$vdst), !if(EnableSaddr, - (ins SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, slc:$slc), - (ins VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc)), + (ins SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc), + (ins VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)), " $vdst, "#!if(EnableSaddr, "off", "$vaddr")#!if(EnableSaddr, ", $saddr", ", off")#"$offset$glc$slc"> { let has_data = 0; let mayLoad = 1; @@ -204,8 +204,8 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En opName, (outs), !if(EnableSaddr, - (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, slc:$slc), - (ins vdataClass:$vdata, VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc)), + (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, SLC:$slc), + (ins vdataClass:$vdata, VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, SLC:$slc)), " "#!if(EnableSaddr, "off", "$vaddr")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc"> { let mayLoad = 0; let mayStore = 1; @@ -260,7 +260,7 @@ multiclass FLAT_Atomic_Pseudo< RegisterClass data_rc = vdst_rc> { def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc), " $vaddr, $vdata$offset$slc">, AtomicNoRet <opName, 0> { let PseudoInstr = NAME; @@ -268,7 +268,7 @@ multiclass FLAT_Atomic_Pseudo< def _RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_rc:$vdst), - (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc), " $vdst, $vaddr, $vdata$offset glc$slc", [(set vt:$vdst, (atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, @@ -285,7 +285,7 @@ multiclass FLAT_Global_Atomic_Pseudo< def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc), " $vaddr, $vdata, off$offset$slc">, AtomicNoRet <opName, 0> { let has_saddr = 1; @@ -294,7 +294,7 @@ multiclass FLAT_Global_Atomic_Pseudo< def _RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_rc:$vdst), - (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc), " $vdst, $vaddr, $vdata, off$offset glc$slc", [(set vt:$vdst, (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, @@ -304,7 +304,7 @@ multiclass 
FLAT_Global_Atomic_Pseudo< def _SADDR : FLAT_AtomicNoRet_Pseudo <opName, (outs), - (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, slc:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc), " $vaddr, $vdata, $saddr$offset$slc">, AtomicNoRet <opName#"_saddr", 0> { let has_saddr = 1; @@ -314,7 +314,7 @@ multiclass FLAT_Global_Atomic_Pseudo< def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_rc:$vdst), - (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, slc:$slc), + (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc), " $vdst, $vaddr, $vdata, $saddr$offset glc$slc">, AtomicNoRet <opName#"_saddr", 1> { let has_saddr = 1; @@ -780,7 +780,7 @@ def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>; def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>; def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>; -let OtherPredicates = [HasD16LoadStore] in { +let OtherPredicates = [D16PreservesUnusedBits] in { def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; @@ -824,7 +824,7 @@ def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32>; def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32>; def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32>; -let OtherPredicates = [HasD16LoadStore] in { +let OtherPredicates = [D16PreservesUnusedBits] in { def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>; def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>; diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index dd515b0bf2f1..f236f10ba75a 100644 --- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -16,6 +16,7 @@ #include "SIDefines.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/MachineFunction.h" @@ -39,7 +40,7 @@ using namespace llvm; GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : CurrCycleInstr(nullptr), MF(MF), - ST(MF.getSubtarget<SISubtarget>()), + ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()), ClauseUses(TRI.getNumRegUnits()), @@ -355,13 +356,13 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { } int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); int WaitStatesNeeded = 0; WaitStatesNeeded = checkSoftClauseHazards(SMRD); // This SMRD hazard only affects SI. 
- if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS) + if (ST.getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS) return WaitStatesNeeded; // A read of an SGPR by SMRD instruction requires 4 wait states when the @@ -398,7 +399,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { } int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { - if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return 0; int WaitStatesNeeded = checkSoftClauseHazards(VMEM); @@ -634,7 +635,7 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) { } int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) { - if (MI->isDebugValue()) + if (MI->isDebugInstr()) return 0; const SIRegisterInfo *TRI = ST.getRegisterInfo(); diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h index f9a6e395a454..ca17e7cb6018 100644 --- a/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -28,7 +28,7 @@ class MachineRegisterInfo; class ScheduleDAG; class SIInstrInfo; class SIRegisterInfo; -class SISubtarget; +class GCNSubtarget; class GCNHazardRecognizer final : public ScheduleHazardRecognizer { // This variable stores the instruction that has been emitted this cycle. It @@ -37,7 +37,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { MachineInstr *CurrCycleInstr; std::list<MachineInstr*> EmittedInstrs; const MachineFunction &MF; - const SISubtarget &ST; + const GCNSubtarget &ST; const SIInstrInfo &TII; const SIRegisterInfo &TRI; diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp index ba8211b189cf..651091d44136 100644 --- a/lib/Target/AMDGPU/GCNILPSched.cpp +++ b/lib/Target/AMDGPU/GCNILPSched.cpp @@ -149,9 +149,9 @@ static int BUCompareLatency(const SUnit *left, const SUnit *right) { int LDepth = left->getDepth(); int RDepth = right->getDepth(); if (LDepth != RDepth) { - DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum - << ") depth " << LDepth << " vs SU (" << right->NodeNum - << ") depth " << RDepth << "\n"); + LLVM_DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum + << ") depth " << LDepth << " vs SU (" << right->NodeNum + << ") depth " << RDepth << "\n"); return LDepth < RDepth ? 1 : -1; } if (left->Latency != right->Latency) @@ -169,9 +169,9 @@ const SUnit *GCNILPScheduler::pickBest(const SUnit *left, const SUnit *right) if (!DisableSchedCriticalPath) { int spread = (int)left->getDepth() - (int)right->getDepth(); if (std::abs(spread) > MaxReorderWindow) { - DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): " - << left->getDepth() << " != SU(" << right->NodeNum << "): " - << right->getDepth() << "\n"); + LLVM_DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): " + << left->getDepth() << " != SU(" << right->NodeNum + << "): " << right->getDepth() << "\n"); return left->getDepth() < right->getDepth() ? 
right : left; } } @@ -324,19 +324,18 @@ GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots, if (AvailQueue.empty()) break; - DEBUG( - dbgs() << "\n=== Picking candidate\n" - "Ready queue:"; - for (auto &C : AvailQueue) - dbgs() << ' ' << C.SU->NodeNum; - dbgs() << '\n'; - ); + LLVM_DEBUG(dbgs() << "\n=== Picking candidate\n" + "Ready queue:"; + for (auto &C + : AvailQueue) dbgs() + << ' ' << C.SU->NodeNum; + dbgs() << '\n';); auto C = pickCandidate(); assert(C); AvailQueue.remove(*C); auto SU = C->SU; - DEBUG(dbgs() << "Selected "; SU->dump(&DAG)); + LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG)); advanceToCycle(SU->getHeight()); diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index a0e4f7ff24cb..15366d66bd85 100644 --- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -11,6 +11,7 @@ #include "AMDGPUSubtarget.h" #include "GCNRegPressure.h" #include "GCNSchedStrategy.h" +#include "SIMachineFunctionInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -19,6 +20,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Config/llvm-config.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -68,14 +70,14 @@ static void printRegion(raw_ostream &OS, auto I = Begin; MaxInstNum = std::max(MaxInstNum, 1u); for (; I != End && MaxInstNum; ++I, --MaxInstNum) { - if (!I->isDebugValue() && LIS) + if (!I->isDebugInstr() && LIS) OS << LIS->getInstructionIndex(*I); OS << '\t' << *I; } if (I != End) { OS << "\t...\n"; I = std::prev(End); - if (!I->isDebugValue() && LIS) + if (!I->isDebugInstr() && LIS) OS << LIS->getInstructionIndex(*I); OS << '\t' << *I; } @@ -106,7 +108,7 @@ static void printLivenessInfo(raw_ostream &OS, LLVM_DUMP_METHOD void GCNIterativeScheduler::printRegions(raw_ostream &OS) const { - const auto &ST = MF.getSubtarget<SISubtarget>(); + const auto &ST = MF.getSubtarget<GCNSubtarget>(); for (const auto R : Regions) { OS << "Region to schedule "; printRegion(OS, R->Begin, R->End, LIS, 1); @@ -130,7 +132,7 @@ LLVM_DUMP_METHOD void GCNIterativeScheduler::printSchedRP(raw_ostream &OS, const GCNRegPressure &Before, const GCNRegPressure &After) const { - const auto &ST = MF.getSubtarget<SISubtarget>(); + const auto &ST = MF.getSubtarget<GCNSubtarget>(); OS << "RP before: "; Before.print(OS, &ST); OS << "RP after: "; @@ -199,8 +201,8 @@ public: void schedule() { assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End); - DEBUG(dbgs() << "\nScheduling "; - printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2)); + LLVM_DEBUG(dbgs() << "\nScheduling "; + printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2)); Sch.BaseClass::schedule(); // Unfortunatelly placeDebugValues incorrectly modifies RegionEnd, restore @@ -310,14 +312,13 @@ void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overriden void GCNIterativeScheduler::schedule() { // overriden // do nothing - DEBUG( - printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS); - if (!Regions.empty() && Regions.back()->Begin == RegionBegin) { - dbgs() << "Max RP: "; - Regions.back()->MaxPressure.print(dbgs(), &MF.getSubtarget<SISubtarget>()); - } - dbgs() << '\n'; - ); + LLVM_DEBUG(printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS); + if (!Regions.empty() && Regions.back()->Begin == RegionBegin) { + dbgs() << "Max RP: "; + 
Regions.back()->MaxPressure.print( + dbgs(), &MF.getSubtarget<GCNSubtarget>()); + } dbgs() + << '\n';); } void GCNIterativeScheduler::finalizeSchedule() { // overriden @@ -383,10 +384,10 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule, if (MI != &*Top) { BB->remove(MI); BB->insert(Top, MI); - if (!MI->isDebugValue()) + if (!MI->isDebugInstr()) LIS->handleMove(*MI, true); } - if (!MI->isDebugValue()) { + if (!MI->isDebugInstr()) { // Reset read - undef flags and update them later. for (auto &Op : MI->operands()) if (Op.isReg() && Op.isDef()) @@ -417,7 +418,7 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule, #ifndef NDEBUG const auto RegionMaxRP = getRegionPressure(R); - const auto &ST = MF.getSubtarget<SISubtarget>(); + const auto &ST = MF.getSubtarget<GCNSubtarget>(); #endif assert((SchedMaxRP == RegionMaxRP && (MaxRP.empty() || SchedMaxRP == MaxRP)) || (dbgs() << "Max RP mismatch!!!\n" @@ -432,8 +433,8 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule, // Sort recorded regions by pressure - highest at the front void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) { - const auto &ST = MF.getSubtarget<SISubtarget>(); - std::sort(Regions.begin(), Regions.end(), + const auto &ST = MF.getSubtarget<GCNSubtarget>(); + llvm::sort(Regions.begin(), Regions.end(), [&ST, TargetOcc](const Region *R1, const Region *R2) { return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc); }); @@ -450,24 +451,24 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) { // BestSchedules aren't deleted on fail. unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { // TODO: assert Regions are sorted descending by pressure - const auto &ST = MF.getSubtarget<SISubtarget>(); + const auto &ST = MF.getSubtarget<GCNSubtarget>(); const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); - DEBUG(dbgs() << "Trying to to improve occupancy, target = " << TargetOcc - << ", current = " << Occ << '\n'); + LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc + << ", current = " << Occ << '\n'); auto NewOcc = TargetOcc; for (auto R : Regions) { if (R->MaxPressure.getOccupancy(ST) >= NewOcc) break; - DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3); - printLivenessInfo(dbgs(), R->Begin, R->End, LIS)); + LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3); + printLivenessInfo(dbgs(), R->Begin, R->End, LIS)); BuildDAG DAG(*R, *this); const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this); const auto MaxRP = getSchedulePressure(*R, MinSchedule); - DEBUG(dbgs() << "Occupancy improvement attempt:\n"; - printSchedRP(dbgs(), R->MaxPressure, MaxRP)); + LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n"; + printSchedRP(dbgs(), R->MaxPressure, MaxRP)); NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST)); if (NewOcc <= Occ) @@ -475,15 +476,21 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) { setBestSchedule(*R, MinSchedule, MaxRP); } - DEBUG(dbgs() << "New occupancy = " << NewOcc - << ", prev occupancy = " << Occ << '\n'); + LLVM_DEBUG(dbgs() << "New occupancy = " << NewOcc + << ", prev occupancy = " << Occ << '\n'); + if (NewOcc > Occ) { + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + MFI->increaseOccupancy(MF, NewOcc); + } + return std::max(NewOcc, Occ); } void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( bool TryMaximizeOccupancy) { - const auto &ST = MF.getSubtarget<SISubtarget>(); - auto TgtOcc = 
ST.getOccupancyWithLocalMemSize(MF); + const auto &ST = MF.getSubtarget<GCNSubtarget>(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + auto TgtOcc = MFI->getMinAllowedOccupancy(); sortRegionsByPressure(TgtOcc); auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); @@ -496,9 +503,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( const int NumPasses = Occ < TgtOcc ? 2 : 1; TgtOcc = std::min(Occ, TgtOcc); - DEBUG(dbgs() << "Scheduling using default scheduler, " - "target occupancy = " << TgtOcc << '\n'); + LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, " + "target occupancy = " + << TgtOcc << '\n'); GCNMaxOccupancySchedStrategy LStrgy(Context); + unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy()); for (int I = 0; I < NumPasses; ++I) { // running first pass with TargetOccupancy = 0 mimics previous scheduling @@ -509,30 +518,33 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( Ovr.schedule(); const auto RP = getRegionPressure(*R); - DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); + LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); if (RP.getOccupancy(ST) < TgtOcc) { - DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); + LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) { - DEBUG(dbgs() << ", scheduling minimal register\n"); + LLVM_DEBUG(dbgs() << ", scheduling minimal register\n"); scheduleBest(*R); } else { - DEBUG(dbgs() << ", restoring\n"); + LLVM_DEBUG(dbgs() << ", restoring\n"); Ovr.restoreOrder(); assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc); } } + FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST)); } } + MFI->limitOccupancy(FinalOccupancy); } /////////////////////////////////////////////////////////////////////////////// // Minimal Register Strategy void GCNIterativeScheduler::scheduleMinReg(bool force) { - const auto &ST = MF.getSubtarget<SISubtarget>(); - const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF); + const auto &ST = MF.getSubtarget<GCNSubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const auto TgtOcc = MFI->getOccupancy(); sortRegionsByPressure(TgtOcc); auto MaxPressure = Regions.front()->MaxPressure; @@ -544,7 +556,7 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) { const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this); const auto RP = getSchedulePressure(*R, MinSchedule); - DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) { + LLVM_DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) { dbgs() << "\nWarning: Pressure becomes worse after minreg!"; printSchedRP(dbgs(), R->MaxPressure, RP); }); @@ -553,7 +565,7 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) { break; scheduleRegion(*R, MinSchedule, RP); - DEBUG(printSchedResult(dbgs(), R, RP)); + LLVM_DEBUG(printSchedResult(dbgs(), R, RP)); MaxPressure = RP; } @@ -564,9 +576,9 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) { void GCNIterativeScheduler::scheduleILP( bool TryMaximizeOccupancy) { - const auto &ST = MF.getSubtarget<SISubtarget>(); - auto TgtOcc = std::min(ST.getOccupancyWithLocalMemSize(MF), - ST.getWavesPerEU(MF.getFunction()).second); + const auto &ST = MF.getSubtarget<GCNSubtarget>(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + auto TgtOcc = MFI->getMinAllowedOccupancy(); sortRegionsByPressure(TgtOcc); auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); @@ -575,26 +587,30 @@ void 
GCNIterativeScheduler::scheduleILP( Occ = tryMaximizeOccupancy(TgtOcc); TgtOcc = std::min(Occ, TgtOcc); - DEBUG(dbgs() << "Scheduling using default scheduler, " - "target occupancy = " << TgtOcc << '\n'); + LLVM_DEBUG(dbgs() << "Scheduling using default scheduler, " + "target occupancy = " + << TgtOcc << '\n'); + unsigned FinalOccupancy = std::min(Occ, MFI->getOccupancy()); for (auto R : Regions) { BuildDAG DAG(*R, *this); const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this); const auto RP = getSchedulePressure(*R, ILPSchedule); - DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); + LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); if (RP.getOccupancy(ST) < TgtOcc) { - DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); + LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) { - DEBUG(dbgs() << ", scheduling minimal register\n"); + LLVM_DEBUG(dbgs() << ", scheduling minimal register\n"); scheduleBest(*R); } } else { scheduleRegion(*R, ILPSchedule, RP); - DEBUG(printSchedResult(dbgs(), R, RP)); + LLVM_DEBUG(printSchedResult(dbgs(), R, RP)); + FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST)); } } + MFI->limitOccupancy(FinalOccupancy); } diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp index 9904b5f0f4ba..192d534bb9cf 100644 --- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp +++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp @@ -142,35 +142,38 @@ GCNMinRegScheduler::Candidate* GCNMinRegScheduler::pickCandidate() { unsigned Num = RQ.size(); if (Num == 1) break; - DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num << '\n'); + LLVM_DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num + << '\n'); Num = findMax(Num, [=](const Candidate &C) { return C.Priority; }); if (Num == 1) break; - DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among " - << Num << '\n'); + LLVM_DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among " + << Num << '\n'); Num = findMax(Num, [=](const Candidate &C) { auto SU = C.SU; int Res = getNotReadySuccessors(SU); - DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready " - << Res << " successors, metric = " << -Res << '\n'); + LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready " + << Res << " successors, metric = " << -Res << '\n'); return -Res; }); if (Num == 1) break; - DEBUG(dbgs() << "\nSelecting most producing candidate among " - << Num << '\n'); + LLVM_DEBUG(dbgs() << "\nSelecting most producing candidate among " << Num + << '\n'); Num = findMax(Num, [=](const Candidate &C) { auto SU = C.SU; auto Res = getReadySuccessors(SU); - DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready " - << Res << " successors, metric = " << Res << '\n'); + LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready " << Res + << " successors, metric = " << Res << '\n'); return Res; }); if (Num == 1) break; Num = Num ? 
Num : RQ.size(); - DEBUG(dbgs() << "\nCan't find best candidate, selecting in program order among " - << Num << '\n'); + LLVM_DEBUG( + dbgs() + << "\nCan't find best candidate, selecting in program order among " + << Num << '\n'); Num = findMax(Num, [=](const Candidate &C) { return -(int64_t)C.SU->NodeNum; }); assert(Num == 1); } while (false); @@ -202,17 +205,17 @@ void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) { Worklist.push_back(P.getSUnit()); } } - DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum - << ")'s non-ready successors of " << Priority - << " priority in ready queue: "); + LLVM_DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum + << ")'s non-ready successors of " << Priority + << " priority in ready queue: "); const auto SetEnd = Set.end(); for (auto &C : RQ) { if (Set.find(C.SU) != SetEnd) { C.Priority = Priority; - DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')'); + LLVM_DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')'); } } - DEBUG(dbgs() << '\n'); + LLVM_DEBUG(dbgs() << '\n'); } void GCNMinRegScheduler::releaseSuccessors(const SUnit* SU, int Priority) { @@ -243,19 +246,19 @@ GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots, releaseSuccessors(&DAG.EntrySU, StepNo); while (!RQ.empty()) { - DEBUG( - dbgs() << "\n=== Picking candidate, Step = " << StepNo << "\n" - "Ready queue:"; - for (auto &C : RQ) - dbgs() << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')'; - dbgs() << '\n'; - ); + LLVM_DEBUG(dbgs() << "\n=== Picking candidate, Step = " << StepNo + << "\n" + "Ready queue:"; + for (auto &C + : RQ) dbgs() + << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')'; + dbgs() << '\n';); auto C = pickCandidate(); assert(C); RQ.remove(*C); auto SU = C->SU; - DEBUG(dbgs() << "Selected "; SU->dump(&DAG)); + LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG)); releaseSuccessors(SU, StepNo); Schedule.push_back(SU); diff --git a/lib/Target/AMDGPU/GCNProcessors.td b/lib/Target/AMDGPU/GCNProcessors.td index b2a3f652abd8..d76acfa24f90 100644 --- a/lib/Target/AMDGPU/GCNProcessors.td +++ b/lib/Target/AMDGPU/GCNProcessors.td @@ -93,14 +93,6 @@ def : ProcessorModel<"bonaire", SIQuarterSpeedModel, // GCN GFX8 (Volcanic Islands (VI)). 
//===----------------------------------------------------------------------===// -def : ProcessorModel<"gfx800", SIQuarterSpeedModel, - [FeatureISAVersion8_0_0] ->; - -def : ProcessorModel<"iceland", SIQuarterSpeedModel, - [FeatureISAVersion8_0_0] ->; - def : ProcessorModel<"gfx801", SIQuarterSpeedModel, [FeatureISAVersion8_0_1] >; @@ -113,6 +105,10 @@ def : ProcessorModel<"gfx802", SIQuarterSpeedModel, [FeatureISAVersion8_0_2] >; +def : ProcessorModel<"iceland", SIQuarterSpeedModel, + [FeatureISAVersion8_0_2] +>; + def : ProcessorModel<"tonga", SIQuarterSpeedModel, [FeatureISAVersion8_0_2] >; @@ -152,3 +148,11 @@ def : ProcessorModel<"gfx900", SIQuarterSpeedModel, def : ProcessorModel<"gfx902", SIQuarterSpeedModel, [FeatureISAVersion9_0_2] >; + +def : ProcessorModel<"gfx904", SIQuarterSpeedModel, + [FeatureISAVersion9_0_4] +>; + +def : ProcessorModel<"gfx906", SIQuarterSpeedModel, + [FeatureISAVersion9_0_6] +>; diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index 992bb7cceb6f..3d8cacc4f02c 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -19,6 +19,7 @@ #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Config/llvm-config.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" @@ -131,7 +132,7 @@ void GCNRegPressure::inc(unsigned Reg, } } -bool GCNRegPressure::less(const SISubtarget &ST, +bool GCNRegPressure::less(const GCNSubtarget &ST, const GCNRegPressure& O, unsigned MaxOccupancy) const { const auto SGPROcc = std::min(MaxOccupancy, @@ -177,7 +178,7 @@ bool GCNRegPressure::less(const SISubtarget &ST, #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD -void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const { +void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const { OS << "VGPRs: " << getVGPRNum(); if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')'; OS << ", SGPRs: " << getSGPRNum(); @@ -283,24 +284,33 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, return LiveRegs; } -void GCNUpwardRPTracker::reset(const MachineInstr &MI, - const LiveRegSet *LiveRegsCopy) { - MRI = &MI.getParent()->getParent()->getRegInfo(); +void GCNRPTracker::reset(const MachineInstr &MI, + const LiveRegSet *LiveRegsCopy, + bool After) { + const MachineFunction &MF = *MI.getMF(); + MRI = &MF.getRegInfo(); if (LiveRegsCopy) { if (&LiveRegs != LiveRegsCopy) LiveRegs = *LiveRegsCopy; } else { - LiveRegs = getLiveRegsAfter(MI, LIS); + LiveRegs = After ? 
getLiveRegsAfter(MI, LIS) + : getLiveRegsBefore(MI, LIS); } + MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); } +void GCNUpwardRPTracker::reset(const MachineInstr &MI, + const LiveRegSet *LiveRegsCopy) { + GCNRPTracker::reset(MI, LiveRegsCopy, true); +} + void GCNUpwardRPTracker::recede(const MachineInstr &MI) { assert(MRI && "call reset first"); LastTrackedMI = &MI; - if (MI.isDebugValue()) + if (MI.isDebugInstr()) return; auto const RegUses = collectVirtualRegUses(MI, LIS, *MRI); @@ -348,13 +358,7 @@ bool GCNDownwardRPTracker::reset(const MachineInstr &MI, NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); if (NextMI == MBBEnd) return false; - if (LiveRegsCopy) { - if (&LiveRegs != LiveRegsCopy) - LiveRegs = *LiveRegsCopy; - } else { - LiveRegs = getLiveRegsBefore(*NextMI, LIS); - } - MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); + GCNRPTracker::reset(*NextMI, LiveRegsCopy, false); return true; } diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h index e418aa0fe911..357d3b7b2334 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.h +++ b/lib/Target/AMDGPU/GCNRegPressure.h @@ -49,7 +49,7 @@ struct GCNRegPressure { unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; } unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; } - unsigned getOccupancy(const SISubtarget &ST) const { + unsigned getOccupancy(const GCNSubtarget &ST) const { return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()), ST.getOccupancyWithNumVGPRs(getVGPRNum())); } @@ -59,11 +59,11 @@ struct GCNRegPressure { LaneBitmask NewMask, const MachineRegisterInfo &MRI); - bool higherOccupancy(const SISubtarget &ST, const GCNRegPressure& O) const { + bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure& O) const { return getOccupancy(ST) > O.getOccupancy(ST); } - bool less(const SISubtarget &ST, const GCNRegPressure& O, + bool less(const GCNSubtarget &ST, const GCNRegPressure& O, unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const; bool operator==(const GCNRegPressure &O) const { @@ -74,7 +74,7 @@ struct GCNRegPressure { return !(*this == O); } - void print(raw_ostream &OS, const SISubtarget *ST = nullptr) const; + void print(raw_ostream &OS, const GCNSubtarget *ST = nullptr) const; void dump() const { print(dbgs()); } private: @@ -106,6 +106,9 @@ protected: GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {} + void reset(const MachineInstr &MI, const LiveRegSet *LiveRegsCopy, + bool After); + public: // live regs for the current state const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; } diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp index d414b899050a..f09b7f6cff22 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -28,18 +28,6 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( const MachineSchedContext *C) : GenericScheduler(C), TargetOccupancy(0), MF(nullptr) { } -static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs, - const MachineFunction &MF) { - - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs), - ST.getOccupancyWithNumVGPRs(VGPRs)); - return std::min(MinRegOccupancy, - ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), - MF.getFunction())); -} - void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) { 
GenericScheduler::initialize(DAG); @@ -47,7 +35,7 @@ void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) { MF = &DAG->MF; - const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); // FIXME: This is also necessary, because some passes that run after // scheduling and before regalloc increase register pressure. @@ -81,7 +69,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU Cand.AtTop = AtTop; // getDownwardPressure() and getUpwardPressure() make temporary changes to - // the the tracker, so we need to pass those function a non-const copy. + // the tracker, so we need to pass those function a non-const copy. RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); std::vector<unsigned> Pressure; @@ -200,34 +188,30 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); // See if BotCand is still valid (because we previously scheduled from Top). - DEBUG(dbgs() << "Picking from Bot:\n"); + LLVM_DEBUG(dbgs() << "Picking from Bot:\n"); if (!BotCand.isValid() || BotCand.SU->isScheduled || BotCand.Policy != BotPolicy) { BotCand.reset(CandPolicy()); pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand); assert(BotCand.Reason != NoCand && "failed to find the first candidate"); } else { - DEBUG(traceCandidate(BotCand)); + LLVM_DEBUG(traceCandidate(BotCand)); } // Check if the top Q has a better candidate. - DEBUG(dbgs() << "Picking from Top:\n"); + LLVM_DEBUG(dbgs() << "Picking from Top:\n"); if (!TopCand.isValid() || TopCand.SU->isScheduled || TopCand.Policy != TopPolicy) { TopCand.reset(CandPolicy()); pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand); assert(TopCand.Reason != NoCand && "failed to find the first candidate"); } else { - DEBUG(traceCandidate(TopCand)); + LLVM_DEBUG(traceCandidate(TopCand)); } // Pick best from BotCand and TopCand. 
- DEBUG( - dbgs() << "Top Cand: "; - traceCandidate(TopCand); - dbgs() << "Bot Cand: "; - traceCandidate(BotCand); - ); + LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); + dbgs() << "Bot Cand: "; traceCandidate(BotCand);); SchedCandidate Cand; if (TopCand.Reason == BotCand.Reason) { Cand = BotCand; @@ -256,10 +240,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) { } } } - DEBUG( - dbgs() << "Picking: "; - traceCandidate(Cand); - ); + LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); IsTopNode = Cand.AtTop; return Cand.SU; @@ -305,20 +286,20 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) { if (SU->isBottomReady()) Bot.removeReady(SU); - DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr()); + LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " + << *SU->getInstr()); return SU; } GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S) : ScheduleDAGMILive(C, std::move(S)), - ST(MF.getSubtarget<SISubtarget>()), + ST(MF.getSubtarget<GCNSubtarget>()), MFI(*MF.getInfo<SIMachineFunctionInfo>()), - StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(), - MF.getFunction())), + StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) { - DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); + LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); } void GCNScheduleDAGMILive::schedule() { @@ -338,12 +319,12 @@ void GCNScheduleDAGMILive::schedule() { if (LIS) { PressureBefore = Pressure[RegionIdx]; - DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:"; - GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI); - dbgs() << "Region live-in pressure: "; - llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs()); - dbgs() << "Region register pressure: "; - PressureBefore.print(dbgs())); + LLVM_DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:"; + GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI); + dbgs() << "Region live-in pressure: "; + llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs()); + dbgs() << "Region register pressure: "; + PressureBefore.print(dbgs())); } ScheduleDAGMILive::schedule(); @@ -356,45 +337,54 @@ void GCNScheduleDAGMILive::schedule() { GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; auto PressureAfter = getRealRegPressure(); - DEBUG(dbgs() << "Pressure after scheduling: "; PressureAfter.print(dbgs())); + LLVM_DEBUG(dbgs() << "Pressure after scheduling: "; + PressureAfter.print(dbgs())); if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) { Pressure[RegionIdx] = PressureAfter; - DEBUG(dbgs() << "Pressure in desired limits, done.\n"); + LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); return; } - unsigned WavesAfter = getMaxWaves(PressureAfter.getSGPRNum(), - PressureAfter.getVGPRNum(), MF); - unsigned WavesBefore = getMaxWaves(PressureBefore.getSGPRNum(), - PressureBefore.getVGPRNum(), MF); - DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore << - ", after " << WavesAfter << ".\n"); + unsigned Occ = MFI.getOccupancy(); + unsigned WavesAfter = std::min(Occ, PressureAfter.getOccupancy(ST)); + unsigned WavesBefore = std::min(Occ, PressureBefore.getOccupancy(ST)); + LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore + << ", after " << WavesAfter << ".\n"); // We could not 
keep current target occupancy because of the just scheduled // region. Record new occupancy for next scheduling cycle. unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); + // Allow memory bound functions to drop to 4 waves if not limited by an + // attribute. + if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy && + WavesAfter >= MFI.getMinAllowedOccupancy()) { + LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " + << MFI.getMinAllowedOccupancy() << " waves\n"); + NewOccupancy = WavesAfter; + } if (NewOccupancy < MinOccupancy) { MinOccupancy = NewOccupancy; - DEBUG(dbgs() << "Occupancy lowered for the function to " - << MinOccupancy << ".\n"); + MFI.limitOccupancy(MinOccupancy); + LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " + << MinOccupancy << ".\n"); } - if (WavesAfter >= WavesBefore) { + if (WavesAfter >= MinOccupancy) { Pressure[RegionIdx] = PressureAfter; return; } - DEBUG(dbgs() << "Attempting to revert scheduling.\n"); + LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n"); RegionEnd = RegionBegin; for (MachineInstr *MI : Unsched) { - if (MI->isDebugValue()) + if (MI->isDebugInstr()) continue; if (MI->getIterator() != RegionEnd) { BB->remove(MI); BB->insert(RegionEnd, MI); - if (!MI->isDebugValue()) + if (!MI->isDebugInstr()) LIS->handleMove(*MI, true); } // Reset read-undef flags and update them later. @@ -403,7 +393,7 @@ void GCNScheduleDAGMILive::schedule() { Op.setIsUndef(false); RegisterOperands RegOpers; RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false); - if (!MI->isDebugValue()) { + if (!MI->isDebugInstr()) { if (ShouldTrackLaneMasks) { // Adjust liveness and add missing dead+read-undef flags. SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); @@ -415,7 +405,7 @@ void GCNScheduleDAGMILive::schedule() { } RegionEnd = MI->getIterator(); ++RegionEnd; - DEBUG(dbgs() << "Scheduling " << *MI); + LLVM_DEBUG(dbgs() << "Scheduling " << *MI); } RegionBegin = Unsched.front()->getIterator(); Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); @@ -490,7 +480,7 @@ void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { void GCNScheduleDAGMILive::finalizeSchedule() { GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; - DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); + LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); LiveIns.resize(Regions.size()); Pressure.resize(Regions.size()); @@ -509,9 +499,10 @@ void GCNScheduleDAGMILive::finalizeSchedule() { if (!LIS || StartingOccupancy <= MinOccupancy) break; - DEBUG(dbgs() - << "Retrying function scheduling with lowest recorded occupancy " - << MinOccupancy << ".\n"); + LLVM_DEBUG( + dbgs() + << "Retrying function scheduling with lowest recorded occupancy " + << MinOccupancy << ".\n"); S.setTargetOccupancy(MinOccupancy); } @@ -537,12 +528,13 @@ void GCNScheduleDAGMILive::finalizeSchedule() { continue; } - DEBUG(dbgs() << "********** MI Scheduling **********\n"); - DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " " - << MBB->getName() << "\n From: " << *begin() << " To: "; - if (RegionEnd != MBB->end()) dbgs() << *RegionEnd; - else dbgs() << "End"; - dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); + LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n"); + LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " " + << MBB->getName() << "\n From: " << *begin() + << " To: "; + if (RegionEnd != 
MBB->end()) dbgs() << *RegionEnd; + else dbgs() << "End"; + dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); schedule(); diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h index 060d2ca72d93..3ac6af89cb9b 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -21,7 +21,7 @@ namespace llvm { class SIMachineFunctionInfo; class SIRegisterInfo; -class SISubtarget; +class GCNSubtarget; /// This is a minimal scheduler strategy. The main difference between this /// and the GenericScheduler is that GCNSchedStrategy uses different @@ -62,9 +62,9 @@ public: class GCNScheduleDAGMILive : public ScheduleDAGMILive { - const SISubtarget &ST; + const GCNSubtarget &ST; - const SIMachineFunctionInfo &MFI; + SIMachineFunctionInfo &MFI; // Occupancy target at the beginning of function scheduling cycle. unsigned StartingOccupancy; diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index bf57f88bef91..db908368a179 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -217,6 +217,11 @@ void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo, printNamedBit(MI, OpNo, O, "lwe"); } +void AMDGPUInstPrinter::printD16(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + printNamedBit(MI, OpNo, O, "d16"); +} + void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -267,6 +272,9 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, case AMDGPU::FLAT_SCR: O << "flat_scratch"; return; + case AMDGPU::XNACK_MASK: + O << "xnack_mask"; + return; case AMDGPU::VCC_LO: O << "vcc_lo"; return; @@ -297,6 +305,12 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, case AMDGPU::FLAT_SCR_HI: O << "flat_scratch_hi"; return; + case AMDGPU::XNACK_MASK_LO: + O << "xnack_mask_lo"; + return; + case AMDGPU::XNACK_MASK_HI: + O << "xnack_mask_hi"; + return; case AMDGPU::FP_REG: case AMDGPU::SP_REG: case AMDGPU::SCRATCH_WAVE_OFFSET_REG: @@ -371,6 +385,16 @@ void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, printOperand(MI, OpNo, STI, O); } +void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { + if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI)) + O << " "; + else + O << "_e32 "; + + printOperand(MI, OpNo, STI, O); +} + void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -486,11 +510,6 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) { - static_cast<R600InstPrinter*>(this)->printOperand(MI, OpNo, O); - return; - } - if (OpNo >= MI->getNumOperands()) { O << "/*Missing OP" << OpNo << "*/"; return; @@ -612,40 +631,45 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI, void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { + using namespace AMDGPU::DPP; + unsigned Imm = MI->getOperand(OpNo).getImm(); - if (Imm <= 0x0ff) { + if (Imm <= DppCtrl::QUAD_PERM_LAST) { O << " quad_perm:["; O << formatDec(Imm & 0x3) << ','; O << formatDec((Imm & 0xc) >> 2) << ','; O << formatDec((Imm & 0x30) >> 4) << ','; O << 
formatDec((Imm & 0xc0) >> 6) << ']'; - } else if ((Imm >= 0x101) && (Imm <= 0x10f)) { + } else if ((Imm >= DppCtrl::ROW_SHL_FIRST) && + (Imm <= DppCtrl::ROW_SHL_LAST)) { O << " row_shl:"; printU4ImmDecOperand(MI, OpNo, O); - } else if ((Imm >= 0x111) && (Imm <= 0x11f)) { + } else if ((Imm >= DppCtrl::ROW_SHR_FIRST) && + (Imm <= DppCtrl::ROW_SHR_LAST)) { O << " row_shr:"; printU4ImmDecOperand(MI, OpNo, O); - } else if ((Imm >= 0x121) && (Imm <= 0x12f)) { + } else if ((Imm >= DppCtrl::ROW_ROR_FIRST) && + (Imm <= DppCtrl::ROW_ROR_LAST)) { O << " row_ror:"; printU4ImmDecOperand(MI, OpNo, O); - } else if (Imm == 0x130) { + } else if (Imm == DppCtrl::WAVE_SHL1) { O << " wave_shl:1"; - } else if (Imm == 0x134) { + } else if (Imm == DppCtrl::WAVE_ROL1) { O << " wave_rol:1"; - } else if (Imm == 0x138) { + } else if (Imm == DppCtrl::WAVE_SHR1) { O << " wave_shr:1"; - } else if (Imm == 0x13c) { + } else if (Imm == DppCtrl::WAVE_ROR1) { O << " wave_ror:1"; - } else if (Imm == 0x140) { + } else if (Imm == DppCtrl::ROW_MIRROR) { O << " row_mirror"; - } else if (Imm == 0x141) { + } else if (Imm == DppCtrl::ROW_HALF_MIRROR) { O << " row_half_mirror"; - } else if (Imm == 0x142) { + } else if (Imm == DppCtrl::BCAST15) { O << " row_bcast:15"; - } else if (Imm == 0x143) { + } else if (Imm == DppCtrl::BCAST31) { O << " row_bcast:31"; } else { - llvm_unreachable("Invalid dpp_ctrl value"); + O << " /* Invalid dpp_ctrl value */"; } } @@ -936,11 +960,6 @@ void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) { - static_cast<R600InstPrinter*>(this)->printMemOperand(MI, OpNo, O); - return; - } - printOperand(MI, OpNo, STI, O); O << ", "; printOperand(MI, OpNo + 1, STI, O); @@ -966,16 +985,6 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, O << Asm; } -void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printAbs(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printClamp(MI, OpNo, O); -} - void AMDGPUInstPrinter::printHigh(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -1002,70 +1011,6 @@ void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, O << " div:2"; } -void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printLiteral(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printLast(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printNeg(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printOMOD(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printRel(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned 
OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printUpdateExecMask(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printUpdatePred(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printWrite(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printBankSwizzle(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printRSel(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printCT(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast<R600InstPrinter*>(this)->printKCache(MI, OpNo, O); -} - void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -1254,7 +1199,10 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo, const unsigned Width = ((SImm16 & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1; O << "hwreg("; - if (ID_SYMBOLIC_FIRST_ <= Id && Id < ID_SYMBOLIC_LAST_) { + unsigned Last = ID_SYMBOLIC_LAST_; + if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI) || AMDGPU::isVI(STI)) + Last = ID_SYMBOLIC_FIRST_GFX9_; + if (ID_SYMBOLIC_FIRST_ <= Id && Id < Last && IdSymbolic[Id]) { O << IdSymbolic[Id]; } else { O << Id; @@ -1267,6 +1215,13 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo, #include "AMDGPUGenAsmWriter.inc" +void R600InstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, const MCSubtargetInfo &STI) { + O.flush(); + printInstruction(MI, O); + printAnnotation(O, Annot); +} + void R600InstPrinter::printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O) { AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '|'); @@ -1385,7 +1340,7 @@ void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, if (Op.isReg()) { switch (Op.getReg()) { // This is the default predicate state, so we don't need to print it. 
- case AMDGPU::PRED_SEL_OFF: + case R600::PRED_SEL_OFF: break; default: @@ -1461,3 +1416,5 @@ void R600InstPrinter::printWrite(const MCInst *MI, unsigned OpNo, O << " (MASKED)"; } } + +#include "R600GenAsmWriter.inc" diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index d97f04689e18..11a496a38b2c 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -84,6 +84,8 @@ private: raw_ostream &O); void printLWE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printD16(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printExpCompr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printExpVM(const MCInst *MI, unsigned OpNo, @@ -96,6 +98,8 @@ private: void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI, @@ -214,13 +218,16 @@ protected: raw_ostream &O); }; -// FIXME: R600 specific parts of AMDGPUInstrPrinter should be moved here, and -// MCTargetDesc should be using R600InstPrinter for the R600 target. -class R600InstPrinter : public AMDGPUInstPrinter { +class R600InstPrinter : public MCInstPrinter { public: R600InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) - : AMDGPUInstPrinter(MAI, MII, MRI) {} + : MCInstPrinter(MAI, MII, MRI) {} + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O); diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 778d4a7ba9d0..abc88c02adca 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -26,14 +26,14 @@ namespace { class AMDGPUAsmBackend : public MCAsmBackend { public: - AMDGPUAsmBackend(const Target &T) - : MCAsmBackend() {} + AMDGPUAsmBackend(const Target &T) : MCAsmBackend(support::little) {} unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; }; void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsResolved) const override; + uint64_t Value, bool IsResolved, + const MCSubtargetInfo *STI) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override { @@ -43,10 +43,13 @@ public: MCInst &Res) const override { llvm_unreachable("Not implemented"); } - bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } + bool mayNeedRelaxation(const MCInst &Inst, + const MCSubtargetInfo &STI) const override { + return false; + } unsigned getMinimumNopSize() const override; - bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; + bool writeNopData(raw_ostream &OS, uint64_t Count) const override; 
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; }; @@ -103,7 +106,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, uint64_t Value, - bool IsResolved) const { + bool IsResolved, + const MCSubtargetInfo *STI) const { Value = adjustFixupValue(Fixup, Value, &Asm.getContext()); if (!Value) return; // Doesn't change encoding. @@ -140,11 +144,11 @@ unsigned AMDGPUAsmBackend::getMinimumNopSize() const { return 4; } -bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { +bool AMDGPUAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const { // If the count is not 4-byte aligned, we must be writing data into the text // section (otherwise we have unaligned instructions, and thus have far // bigger problems), so just write zeros instead. - OW->WriteZeros(Count % 4); + OS.write_zeros(Count % 4); // We are properly aligned, so write NOPs as requested. Count /= 4; @@ -154,7 +158,7 @@ bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { const uint32_t Encoded_S_NOP_0 = 0xbf800000; for (uint64_t I = 0; I != Count; ++I) - OW->write32(Encoded_S_NOP_0); + support::endian::write<uint32_t>(OS, Encoded_S_NOP_0, Endian); return true; } @@ -189,18 +193,18 @@ public: } } - std::unique_ptr<MCObjectWriter> - createObjectWriter(raw_pwrite_stream &OS) const override { - return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend, OS); + std::unique_ptr<MCObjectTargetWriter> + createObjectTargetWriter() const override { + return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend); } }; } // end anonymous namespace MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, + const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, const MCTargetOptions &Options) { // Use 64-bit ELF for amdgcn - return new ELFAMDGPUAsmBackend(T, TT); + return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple()); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index e443b0729606..07bef9103c0d 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -66,6 +66,8 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_AMDGPU_REL32_LO; case MCSymbolRefExpr::VK_AMDGPU_REL32_HI: return ELF::R_AMDGPU_REL32_HI; + case MCSymbolRefExpr::VK_AMDGPU_REL64: + return ELF::R_AMDGPU_REL64; } switch (Fixup.getKind()) { @@ -82,11 +84,9 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, llvm_unreachable("unhandled relocation type"); } -std::unique_ptr<MCObjectWriter> +std::unique_ptr<MCObjectTargetWriter> llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, - bool HasRelocationAddend, - raw_pwrite_stream &OS) { - auto MOTW = llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI, - HasRelocationAddend); - return createELFObjectWriter(std::move(MOTW), OS, true); + bool HasRelocationAddend) { + return llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI, + HasRelocationAddend); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp index 1497edc7a054..c627a08e7463 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp @@ 
-12,37 +12,28 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCObjectWriter.h" using namespace llvm; -AMDGPUELFStreamer::AMDGPUELFStreamer(const Triple &T, MCContext &Context, - std::unique_ptr<MCAsmBackend> MAB, - raw_pwrite_stream &OS, - std::unique_ptr<MCCodeEmitter> Emitter) - : MCELFStreamer(Context, std::move(MAB), OS, std::move(Emitter)) { - unsigned Arch = ELF::EF_AMDGPU_ARCH_NONE; - switch (T.getArch()) { - case Triple::r600: - Arch = ELF::EF_AMDGPU_ARCH_R600; - break; - case Triple::amdgcn: - Arch = ELF::EF_AMDGPU_ARCH_GCN; - break; - default: - break; - } +namespace { + +class AMDGPUELFStreamer : public MCELFStreamer { +public: + AMDGPUELFStreamer(const Triple &T, MCContext &Context, + std::unique_ptr<MCAsmBackend> MAB, + std::unique_ptr<MCObjectWriter> OW, + std::unique_ptr<MCCodeEmitter> Emitter) + : MCELFStreamer(Context, std::move(MAB), std::move(OW), + std::move(Emitter)) {} +}; - MCAssembler &MCA = getAssembler(); - unsigned EFlags = MCA.getELFHeaderEFlags(); - EFlags &= ~ELF::EF_AMDGPU_ARCH; - EFlags |= Arch; - MCA.setELFHeaderEFlags(EFlags); } MCELFStreamer *llvm::createAMDGPUELFStreamer( const Triple &T, MCContext &Context, std::unique_ptr<MCAsmBackend> MAB, - raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter, + std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter, bool RelaxAll) { - return new AMDGPUELFStreamer(T, Context, std::move(MAB), OS, + return new AMDGPUELFStreamer(T, Context, std::move(MAB), std::move(OW), std::move(Emitter)); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h index 0cc0a4c5cd5d..41e9063a759e 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h @@ -23,16 +23,9 @@ class MCCodeEmitter; class MCContext; class MCSubtargetInfo; -class AMDGPUELFStreamer : public MCELFStreamer { -public: - AMDGPUELFStreamer(const Triple &T, MCContext &Context, - std::unique_ptr<MCAsmBackend> MAB, raw_pwrite_stream &OS, - std::unique_ptr<MCCodeEmitter> Emitter); -}; - MCELFStreamer *createAMDGPUELFStreamer(const Triple &T, MCContext &Context, std::unique_ptr<MCAsmBackend> MAB, - raw_pwrite_stream &OS, + std::unique_ptr<MCObjectWriter> OW, std::unique_ptr<MCCodeEmitter> Emitter, bool RelaxAll); } // namespace llvm. diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index 521b3b39bba2..cae7a7a6c7e7 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief CodeEmitter interface for R600 and SI codegen. +/// CodeEmitter interface for R600 and SI codegen. // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index 1b062064ace1..dcc10a032afe 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief CodeEmitter interface for R600 and SI codegen. +/// CodeEmitter interface for R600 and SI codegen. 
// //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 2b321c04fb30..c579c7d60e16 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This file provides AMDGPU specific target descriptions. +/// This file provides AMDGPU specific target descriptions. // //===----------------------------------------------------------------------===// @@ -22,6 +22,7 @@ #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -37,9 +38,17 @@ using namespace llvm; #define GET_SUBTARGETINFO_MC_DESC #include "AMDGPUGenSubtargetInfo.inc" +#define NoSchedModel NoSchedModelR600 +#define GET_SUBTARGETINFO_MC_DESC +#include "R600GenSubtargetInfo.inc" +#undef NoSchedModelR600 + #define GET_REGINFO_MC_DESC #include "AMDGPUGenRegisterInfo.inc" +#define GET_REGINFO_MC_DESC +#include "R600GenRegisterInfo.inc" + static MCInstrInfo *createAMDGPUMCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitAMDGPUMCInstrInfo(X); @@ -48,12 +57,17 @@ static MCInstrInfo *createAMDGPUMCInstrInfo() { static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); - InitAMDGPUMCRegisterInfo(X, 0); + if (TT.getArch() == Triple::r600) + InitR600MCRegisterInfo(X, 0); + else + InitAMDGPUMCRegisterInfo(X, 0); return X; } static MCSubtargetInfo * createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { + if (TT.getArch() == Triple::r600) + return createR600MCSubtargetInfoImpl(TT, CPU, FS); return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS); } @@ -62,8 +76,10 @@ static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T, const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) { - return T.getArch() == Triple::r600 ? 
new R600InstPrinter(MAI, MII, MRI) : - new AMDGPUInstPrinter(MAI, MII, MRI); + if (T.getArch() == Triple::r600) + return new R600InstPrinter(MAI, MII, MRI); + else + return new AMDGPUInstPrinter(MAI, MII, MRI); } static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S, @@ -76,23 +92,25 @@ static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S, static MCTargetStreamer * createAMDGPUObjectTargetStreamer( MCStreamer &S, const MCSubtargetInfo &STI) { - return new AMDGPUTargetELFStreamer(S); + return new AMDGPUTargetELFStreamer(S, STI); } static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, std::unique_ptr<MCAsmBackend> &&MAB, - raw_pwrite_stream &OS, + std::unique_ptr<MCObjectWriter> &&OW, std::unique_ptr<MCCodeEmitter> &&Emitter, bool RelaxAll) { - return createAMDGPUELFStreamer(T, Context, std::move(MAB), OS, + return createAMDGPUELFStreamer(T, Context, std::move(MAB), std::move(OW), std::move(Emitter), RelaxAll); } extern "C" void LLVMInitializeAMDGPUTargetMC() { + + TargetRegistry::RegisterMCInstrInfo(getTheGCNTarget(), createAMDGPUMCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(getTheAMDGPUTarget(), createR600MCInstrInfo); for (Target *T : {&getTheAMDGPUTarget(), &getTheGCNTarget()}) { RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T); - TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo); TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); @@ -103,6 +121,8 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() { // R600 specific registration TargetRegistry::RegisterMCCodeEmitter(getTheAMDGPUTarget(), createR600MCCodeEmitter); + TargetRegistry::RegisterObjectTargetStreamer( + getTheAMDGPUTarget(), createAMDGPUObjectTargetStreamer); // GCN specific registration TargetRegistry::RegisterMCCodeEmitter(getTheGCNTarget(), diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 0b3563303ad0..f3628d96d6e9 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Provides AMDGPU specific target descriptions. +/// Provides AMDGPU specific target descriptions. 
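A note on the AMDGPUMCTargetDesc.cpp hunks above: R600 now gets its own TableGen-generated register, subtarget and instruction info, and the shared MC factory callbacks simply branch on the triple. A simplified, self-contained model of that dispatch pattern follows; the Arch enum and the stub type are placeholders standing in for the real Triple and the generated Init*/create* helpers:

    // Illustration only: one registration entry point, two flavours of the
    // MC objects, selected from the target triple in the same way as
    // createAMDGPUMCRegisterInfo / createAMDGPUMCSubtargetInfo above.
    #include <memory>
    #include <string>

    enum class Arch { R600, AMDGCN };

    struct RegisterInfoStub { std::string Flavour; }; // placeholder type

    static std::unique_ptr<RegisterInfoStub> createMCRegisterInfo(Arch A) {
      auto X = std::make_unique<RegisterInfoStub>();
      // r600 triples pick the R600 tables, everything else picks the GCN ones.
      X->Flavour = (A == Arch::R600) ? "R600" : "AMDGPU";
      return X;
    }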
// //===----------------------------------------------------------------------===// // @@ -25,7 +25,7 @@ class MCAsmBackend; class MCCodeEmitter; class MCContext; class MCInstrInfo; -class MCObjectWriter; +class MCObjectTargetWriter; class MCRegisterInfo; class MCSubtargetInfo; class MCTargetOptions; @@ -40,24 +40,30 @@ Target &getTheGCNTarget(); MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); +MCInstrInfo *createR600MCInstrInfo(); MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); -MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, - const Triple &TT, StringRef CPU, +MCAsmBackend *createAMDGPUAsmBackend(const Target &T, + const MCSubtargetInfo &STI, + const MCRegisterInfo &MRI, const MCTargetOptions &Options); -std::unique_ptr<MCObjectWriter> +std::unique_ptr<MCObjectTargetWriter> createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, - bool HasRelocationAddend, raw_pwrite_stream &OS); + bool HasRelocationAddend); } // End llvm namespace #define GET_REGINFO_ENUM #include "AMDGPUGenRegisterInfo.inc" #undef GET_REGINFO_ENUM +#define GET_REGINFO_ENUM +#include "R600GenRegisterInfo.inc" +#undef GET_REGINFO_ENUM + #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_OPERAND_ENUM #define GET_INSTRINFO_SCHED_ENUM @@ -66,9 +72,20 @@ createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, #undef GET_INSTRINFO_OPERAND_ENUM #undef GET_INSTRINFO_ENUM +#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_OPERAND_ENUM +#define GET_INSTRINFO_SCHED_ENUM +#include "R600GenInstrInfo.inc" +#undef GET_INSTRINFO_SCHED_ENUM +#undef GET_INSTRINFO_OPERAND_ENUM +#undef GET_INSTRINFO_ENUM #define GET_SUBTARGETINFO_ENUM #include "AMDGPUGenSubtargetInfo.inc" #undef GET_SUBTARGETINFO_ENUM +#define GET_SUBTARGETINFO_ENUM +#include "R600GenSubtargetInfo.inc" +#undef GET_SUBTARGETINFO_ENUM + #endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index d897956daccf..6a41e3f650bc 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -39,6 +39,84 @@ using namespace llvm::AMDGPU; // AMDGPUTargetStreamer //===----------------------------------------------------------------------===// +static const struct { + const char *Name; + unsigned Mach; +} MachTable[] = { + // Radeon HD 2000/3000 Series (R600). + { "r600", ELF::EF_AMDGPU_MACH_R600_R600 }, + { "r630", ELF::EF_AMDGPU_MACH_R600_R630 }, + { "rs880", ELF::EF_AMDGPU_MACH_R600_RS880 }, + { "rv670", ELF::EF_AMDGPU_MACH_R600_RV670 }, + // Radeon HD 4000 Series (R700). + { "rv710", ELF::EF_AMDGPU_MACH_R600_RV710 }, + { "rv730", ELF::EF_AMDGPU_MACH_R600_RV730 }, + { "rv770", ELF::EF_AMDGPU_MACH_R600_RV770 }, + // Radeon HD 5000 Series (Evergreen). + { "cedar", ELF::EF_AMDGPU_MACH_R600_CEDAR }, + { "cypress", ELF::EF_AMDGPU_MACH_R600_CYPRESS }, + { "juniper", ELF::EF_AMDGPU_MACH_R600_JUNIPER }, + { "redwood", ELF::EF_AMDGPU_MACH_R600_REDWOOD }, + { "sumo", ELF::EF_AMDGPU_MACH_R600_SUMO }, + // Radeon HD 6000 Series (Northern Islands). + { "barts", ELF::EF_AMDGPU_MACH_R600_BARTS }, + { "caicos", ELF::EF_AMDGPU_MACH_R600_CAICOS }, + { "cayman", ELF::EF_AMDGPU_MACH_R600_CAYMAN }, + { "turks", ELF::EF_AMDGPU_MACH_R600_TURKS }, + // AMDGCN GFX6. 
+ { "gfx600", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 }, + { "tahiti", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 }, + { "gfx601", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 }, + { "hainan", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 }, + { "oland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 }, + { "pitcairn", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 }, + { "verde", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 }, + // AMDGCN GFX7. + { "gfx700", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 }, + { "kaveri", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 }, + { "gfx701", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 }, + { "hawaii", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 }, + { "gfx702", ELF::EF_AMDGPU_MACH_AMDGCN_GFX702 }, + { "gfx703", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 }, + { "kabini", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 }, + { "mullins", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 }, + { "gfx704", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 }, + { "bonaire", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 }, + // AMDGCN GFX8. + { "gfx801", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 }, + { "carrizo", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 }, + { "gfx802", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 }, + { "iceland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 }, + { "tonga", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 }, + { "gfx803", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 }, + { "fiji", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 }, + { "polaris10", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 }, + { "polaris11", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 }, + { "gfx810", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 }, + { "stoney", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 }, + // AMDGCN GFX9. + { "gfx900", ELF::EF_AMDGPU_MACH_AMDGCN_GFX900 }, + { "gfx902", ELF::EF_AMDGPU_MACH_AMDGCN_GFX902 }, + { "gfx904", ELF::EF_AMDGPU_MACH_AMDGCN_GFX904 }, + { "gfx906", ELF::EF_AMDGPU_MACH_AMDGCN_GFX906 }, + // Not specified processor. + { nullptr, ELF::EF_AMDGPU_MACH_NONE } +}; + +unsigned AMDGPUTargetStreamer::getMACH(StringRef GPU) const { + auto Entry = MachTable; + for (; Entry->Name && GPU != Entry->Name; ++Entry) + ; + return Entry->Mach; +} + +const char *AMDGPUTargetStreamer::getMachName(unsigned Mach) { + auto Entry = MachTable; + for (; Entry->Name && Mach != Entry->Mach; ++Entry) + ; + return Entry->Name; +} + bool AMDGPUTargetStreamer::EmitHSAMetadata(StringRef HSAMetadataString) { HSAMD::Metadata HSAMetadata; if (HSAMD::fromString(HSAMetadataString, HSAMetadata)) @@ -55,9 +133,12 @@ AMDGPUTargetAsmStreamer::AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) : AMDGPUTargetStreamer(S), OS(OS) { } -void -AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major, - uint32_t Minor) { +void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) { + OS << "\t.amdgcn_target \"" << Target << "\"\n"; +} + +void AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion( + uint32_t Major, uint32_t Minor) { OS << "\t.hsa_code_object_version " << Twine(Major) << "," << Twine(Minor) << '\n'; } @@ -118,12 +199,157 @@ bool AMDGPUTargetAsmStreamer::EmitPALMetadata( return true; } +void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( + const MCSubtargetInfo &STI, StringRef KernelName, + const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR, + bool ReserveVCC, bool ReserveFlatScr, bool ReserveXNACK) { + amdhsa::kernel_descriptor_t DefaultKD = getDefaultAmdhsaKernelDescriptor(); + + IsaInfo::IsaVersion IVersion = IsaInfo::getIsaVersion(STI.getFeatureBits()); + + OS << "\t.amdhsa_kernel " << KernelName << '\n'; + +#define PRINT_IF_NOT_DEFAULT(STREAM, DIRECTIVE, KERNEL_DESC, \ + DEFAULT_KERNEL_DESC, MEMBER_NAME, FIELD_NAME) \ + if 
(AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) != \ + AMDHSA_BITS_GET(DEFAULT_KERNEL_DESC.MEMBER_NAME, FIELD_NAME)) \ + STREAM << "\t\t" << DIRECTIVE << " " \ + << AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) << '\n'; + + if (KD.group_segment_fixed_size != DefaultKD.group_segment_fixed_size) + OS << "\t\t.amdhsa_group_segment_fixed_size " << KD.group_segment_fixed_size + << '\n'; + if (KD.private_segment_fixed_size != DefaultKD.private_segment_fixed_size) + OS << "\t\t.amdhsa_private_segment_fixed_size " + << KD.private_segment_fixed_size << '\n'; + + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, DefaultKD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD, DefaultKD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_queue_ptr", KD, DefaultKD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_user_sgpr_kernarg_segment_ptr", KD, DefaultKD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_id", KD, DefaultKD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, DefaultKD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_user_sgpr_private_segment_size", KD, DefaultKD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_y", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_z", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_info", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_vgpr_workitem_id", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID); + + // These directives are required. 
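An aside on the PRINT_IF_NOT_DEFAULT macro defined just above: it compares each bit-field of the kernel descriptor against the same field in the default descriptor and prints the directive only when they differ, which keeps the emitted .amdhsa_kernel block minimal. A self-contained model of that pattern is sketched below; the bit position and field name are illustrative, not the real AMDHSAKernelDescriptor.h layout:

    // Model of "print a directive only when a field differs from the default".
    #include <cstdint>
    #include <iostream>

    struct KernelDescStub { uint32_t compute_pgm_rsrc2 = 0; }; // placeholder

    static uint32_t getBits(uint32_t V, uint32_t Shift, uint32_t Width) {
      return (V >> Shift) & ((1u << Width) - 1); // stand-in for AMDHSA_BITS_GET
    }

    static void printIfNotDefault(std::ostream &OS, const char *Directive,
                                  uint32_t Val, uint32_t Def) {
      if (Val != Def)
        OS << "\t\t" << Directive << " " << Val << '\n';
    }

    int main() {
      KernelDescStub KD, DefaultKD;
      KD.compute_pgm_rsrc2 |= 1u << 7; // pretend the workgroup-id-x enable bit is bit 7
      printIfNotDefault(std::cout, ".amdhsa_system_sgpr_workgroup_id_x",
                        getBits(KD.compute_pgm_rsrc2, 7, 1),
                        getBits(DefaultKD.compute_pgm_rsrc2, 7, 1));
    }

The next_free_vgpr and next_free_sgpr directives that follow are the exception: as the comment above notes, they are required, so they are printed unconditionally.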
+ OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n'; + OS << "\t\t.amdhsa_next_free_sgpr " << NextSGPR << '\n'; + + if (!ReserveVCC) + OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n'; + if (IVersion.Major >= 7 && !ReserveFlatScr) + OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n'; + if (IVersion.Major >= 8 && ReserveXNACK != hasXNACK(STI)) + OS << "\t\t.amdhsa_reserve_xnack_mask " << ReserveXNACK << '\n'; + + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_32", KD, DefaultKD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_16_64", KD, DefaultKD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_32", KD, DefaultKD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_16_64", KD, DefaultKD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_dx10_clamp", KD, DefaultKD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP); + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_ieee_mode", KD, DefaultKD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE); + if (IVersion.Major >= 9) + PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_fp16_overflow", KD, DefaultKD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_exception_fp_denorm_src", KD, DefaultKD, compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_exception_fp_ieee_div_zero", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_exception_fp_ieee_overflow", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_exception_fp_ieee_underflow", KD, DefaultKD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_exception_fp_ieee_inexact", KD, DefaultKD, compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT); + PRINT_IF_NOT_DEFAULT( + OS, ".amdhsa_exception_int_div_zero", KD, DefaultKD, compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO); +#undef PRINT_IF_NOT_DEFAULT + + OS << "\t.end_amdhsa_kernel\n"; +} + //===----------------------------------------------------------------------===// // AMDGPUTargetELFStreamer //===----------------------------------------------------------------------===// -AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S) - : AMDGPUTargetStreamer(S), Streamer(S) {} +AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer( + MCStreamer &S, const MCSubtargetInfo &STI) + : AMDGPUTargetStreamer(S), Streamer(S) { + MCAssembler &MCA = getStreamer().getAssembler(); + unsigned EFlags = MCA.getELFHeaderEFlags(); + + EFlags &= ~ELF::EF_AMDGPU_MACH; + EFlags |= getMACH(STI.getCPU()); + + EFlags &= ~ELF::EF_AMDGPU_XNACK; + if (AMDGPU::hasXNACK(STI)) + EFlags |= ELF::EF_AMDGPU_XNACK; + + MCA.setELFHeaderEFlags(EFlags); +} MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() { return 
static_cast<MCELFStreamer &>(Streamer); @@ -150,9 +376,10 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUNote( S.PopSection(); } -void -AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major, - uint32_t Minor) { +void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {} + +void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion( + uint32_t Major, uint32_t Minor) { EmitAMDGPUNote( MCConstantExpr::create(8, getContext()), @@ -207,7 +434,7 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) { MCSymbolELF *Symbol = cast<MCSymbolELF>( getStreamer().getContext().getOrCreateSymbol(SymbolName)); - Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL); + Symbol->setType(Type); } bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) { @@ -271,3 +498,46 @@ bool AMDGPUTargetELFStreamer::EmitPALMetadata( ); return true; } + +void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( + const MCSubtargetInfo &STI, StringRef KernelName, + const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, + bool ReserveXNACK) { + auto &Streamer = getStreamer(); + auto &Context = Streamer.getContext(); + + MCSymbolELF *KernelDescriptorSymbol = cast<MCSymbolELF>( + Context.getOrCreateSymbol(Twine(KernelName) + Twine(".kd"))); + KernelDescriptorSymbol->setBinding(ELF::STB_GLOBAL); + KernelDescriptorSymbol->setType(ELF::STT_OBJECT); + KernelDescriptorSymbol->setSize( + MCConstantExpr::create(sizeof(KernelDescriptor), Context)); + + MCSymbolELF *KernelCodeSymbol = cast<MCSymbolELF>( + Context.getOrCreateSymbol(Twine(KernelName))); + KernelCodeSymbol->setBinding(ELF::STB_LOCAL); + + Streamer.EmitLabel(KernelDescriptorSymbol); + Streamer.EmitBytes(StringRef( + (const char*)&(KernelDescriptor), + offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset))); + // FIXME: Remove the use of VK_AMDGPU_REL64 in the expression below. The + // expression being created is: + // (start of kernel code) - (start of kernel descriptor) + // It implies R_AMDGPU_REL64, but ends up being R_AMDGPU_ABS64. + Streamer.EmitValue(MCBinaryExpr::createSub( + MCSymbolRefExpr::create( + KernelCodeSymbol, MCSymbolRefExpr::VK_AMDGPU_REL64, Context), + MCSymbolRefExpr::create( + KernelDescriptorSymbol, MCSymbolRefExpr::VK_None, Context), + Context), + sizeof(KernelDescriptor.kernel_code_entry_byte_offset)); + Streamer.EmitBytes(StringRef( + (const char*)&(KernelDescriptor) + + offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) + + sizeof(KernelDescriptor.kernel_code_entry_byte_offset), + sizeof(KernelDescriptor) - + offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) - + sizeof(KernelDescriptor.kernel_code_entry_byte_offset))); +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 0919b754480d..472da1b73593 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -14,6 +14,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/AMDGPUMetadata.h" +#include "llvm/Support/AMDHSAKernelDescriptor.h" namespace llvm { #include "AMDGPUPTNote.h" @@ -30,9 +31,17 @@ class AMDGPUTargetStreamer : public MCTargetStreamer { protected: MCContext &getContext() const { return Streamer.getContext(); } + /// \returns Equivalent EF_AMDGPU_MACH_* value for given \p GPU name. 
+ unsigned getMACH(StringRef GPU) const; + public: + /// \returns Equivalent GPU name for an EF_AMDGPU_MACH_* value. + static const char *getMachName(unsigned Mach); + AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} + virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0; + virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major, uint32_t Minor) = 0; @@ -56,12 +65,21 @@ public: /// \returns True on success, false on failure. virtual bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) = 0; + + virtual void EmitAmdhsaKernelDescriptor( + const MCSubtargetInfo &STI, StringRef KernelName, + const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, + bool ReserveXNACK) = 0; }; class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer { formatted_raw_ostream &OS; public: AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS); + + void EmitDirectiveAMDGCNTarget(StringRef Target) override; + void EmitDirectiveHSACodeObjectVersion(uint32_t Major, uint32_t Minor) override; @@ -81,6 +99,12 @@ public: /// \returns True on success, false on failure. bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override; + + void EmitAmdhsaKernelDescriptor( + const MCSubtargetInfo &STI, StringRef KernelName, + const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, + bool ReserveXNACK) override; }; class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { @@ -90,10 +114,12 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { function_ref<void(MCELFStreamer &)> EmitDesc); public: - AMDGPUTargetELFStreamer(MCStreamer &S); + AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI); MCELFStreamer &getStreamer(); + void EmitDirectiveAMDGCNTarget(StringRef Target) override; + void EmitDirectiveHSACodeObjectVersion(uint32_t Major, uint32_t Minor) override; @@ -113,6 +139,12 @@ public: /// \returns True on success, false on failure. bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override; + + void EmitAmdhsaKernelDescriptor( + const MCSubtargetInfo &STI, StringRef KernelName, + const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, + bool ReserveXNACK) override; }; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt index f9cb4678dc51..2d201bbbd7b8 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt @@ -2,11 +2,11 @@ add_llvm_library(LLVMAMDGPUDesc AMDGPUAsmBackend.cpp AMDGPUELFObjectWriter.cpp AMDGPUELFStreamer.cpp - AMDGPUHSAMetadataStreamer.cpp AMDGPUMCAsmInfo.cpp AMDGPUMCCodeEmitter.cpp AMDGPUMCTargetDesc.cpp AMDGPUTargetStreamer.cpp R600MCCodeEmitter.cpp + R600MCTargetDesc.cpp SIMCCodeEmitter.cpp ) diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index eab90e1d344c..28d4bc1829e2 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -9,13 +9,12 @@ // /// \file /// -/// \brief The R600 code emitter produces machine code that can be executed +/// The R600 code emitter produces machine code that can be executed /// directly on the GPU device. 
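An aside, referring back to the MachTable added to AMDGPUTargetStreamer.cpp above: getMACH and getMachName are both sentinel-terminated linear scans, so a GPU name that is not in the table falls through to the { nullptr, EF_AMDGPU_MACH_NONE } entry, and an unknown e_flags value falls through to nullptr. A small self-contained model of that lookup, with placeholder values instead of the real ELF constants:

    // Model of the sentinel-terminated scan used by getMACH/getMachName.
    #include <cstring>

    struct MachEntry { const char *Name; unsigned Mach; };

    // Illustrative two-entry subset; the final entry is the sentinel.
    static const MachEntry Table[] = {
      { "gfx900", 0x2c }, // placeholder, not necessarily the real EF_AMDGPU_MACH_* value
      { "r600",   0x01 }, // placeholder
      { nullptr,  0x00 }  // stands in for EF_AMDGPU_MACH_NONE
    };

    static unsigned getMach(const char *GPU) {
      const MachEntry *Entry = Table;
      // Stop at a matching name or at the sentinel; either way Entry->Mach
      // is the answer (the NONE value for unknown GPUs).
      for (; Entry->Name && std::strcmp(GPU, Entry->Name) != 0; ++Entry)
        ;
      return Entry->Mach;
    }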
// //===----------------------------------------------------------------------===// #include "MCTargetDesc/AMDGPUFixupKinds.h" -#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "R600Defines.h" #include "llvm/MC/MCCodeEmitter.h" @@ -36,30 +35,40 @@ using namespace llvm; namespace { -class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { +class R600MCCodeEmitter : public MCCodeEmitter { const MCRegisterInfo &MRI; + const MCInstrInfo &MCII; public: R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) - : AMDGPUMCCodeEmitter(mcii), MRI(mri) {} + : MRI(mri), MCII(mcii) {} R600MCCodeEmitter(const R600MCCodeEmitter &) = delete; R600MCCodeEmitter &operator=(const R600MCCodeEmitter &) = delete; - /// \brief Encode the instruction and write it to the OS. + /// Encode the instruction and write it to the OS. void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + const MCSubtargetInfo &STI) const; /// \returns the encoding for an MCOperand. uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const override; + const MCSubtargetInfo &STI) const; private: + void Emit(uint32_t value, raw_ostream &OS) const; void Emit(uint64_t value, raw_ostream &OS) const; unsigned getHWReg(unsigned regNo) const; + + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; + void verifyInstructionPredicates(const MCInst &MI, + uint64_t AvailableFeatures) const; + }; } // end anonymous namespace @@ -94,16 +103,16 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, computeAvailableFeatures(STI.getFeatureBits())); const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - if (MI.getOpcode() == AMDGPU::RETURN || - MI.getOpcode() == AMDGPU::FETCH_CLAUSE || - MI.getOpcode() == AMDGPU::ALU_CLAUSE || - MI.getOpcode() == AMDGPU::BUNDLE || - MI.getOpcode() == AMDGPU::KILL) { + if (MI.getOpcode() == R600::RETURN || + MI.getOpcode() == R600::FETCH_CLAUSE || + MI.getOpcode() == R600::ALU_CLAUSE || + MI.getOpcode() == R600::BUNDLE || + MI.getOpcode() == R600::KILL) { return; } else if (IS_VTX(Desc)) { uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI); uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset - if (!(STI.getFeatureBits()[AMDGPU::FeatureCaymanISA])) { + if (!(STI.getFeatureBits()[R600::FeatureCaymanISA])) { InstWord2 |= 1 << 19; // Mega-Fetch bit } @@ -136,7 +145,7 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, Emit((uint32_t) 0, OS); } else { uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI); - if ((STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst]) && + if ((STI.getFeatureBits()[R600::FeatureR600ALUInst]) && ((Desc.TSFlags & R600_InstFlag::OP1) || Desc.TSFlags & R600_InstFlag::OP2)) { uint64_t ISAOpCode = Inst & (0x3FFULL << 39); @@ -148,11 +157,11 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, } void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const { - support::endian::Writer<support::little>(OS).write(Value); + support::endian::write(OS, Value, support::little); } void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const { - support::endian::Writer<support::little>(OS).write(Value); + support::endian::write(OS, Value, support::little); 
} unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const { @@ -186,4 +195,4 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, } #define ENABLE_INSTR_PREDICATE_VERIFIER -#include "AMDGPUGenMCCodeEmitter.inc" +#include "R600GenMCCodeEmitter.inc" diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp new file mode 100644 index 000000000000..1c99a708e5ac --- /dev/null +++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp @@ -0,0 +1,27 @@ +//===-- R600MCTargetDesc.cpp - R600 Target Descriptions -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This file provides R600 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCInstrInfo.h" + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#include "R600GenInstrInfo.inc" + +MCInstrInfo *llvm::createR600MCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitR600MCInstrInfo(X); + return X; +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 94c0157edeb5..36913bd04274 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief The SI code emitter produces machine code that can be executed +/// The SI code emitter produces machine code that can be executed /// directly on the GPU device. // //===----------------------------------------------------------------------===// @@ -43,7 +43,7 @@ namespace { class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { const MCRegisterInfo &MRI; - /// \brief Encode an fp or int literal + /// Encode an fp or int literal uint32_t getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo, const MCSubtargetInfo &STI) const; @@ -54,7 +54,7 @@ public: SIMCCodeEmitter(const SIMCCodeEmitter &) = delete; SIMCCodeEmitter &operator=(const SIMCCodeEmitter &) = delete; - /// \brief Encode the instruction and write it to the OS. + /// Encode the instruction and write it to the OS. void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; @@ -64,7 +64,7 @@ public: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; - /// \brief Use a fixup to encode the simm16 field for SOPP branch + /// Use a fixup to encode the simm16 field for SOPP branch /// instructions. 
unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, @@ -335,13 +335,24 @@ SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, const MCOperand &MO = MI.getOperand(OpNo); - unsigned Reg = MO.getReg(); - RegEnc |= MRI.getEncodingValue(Reg); - RegEnc &= SDWA9EncValues::SRC_VGPR_MASK; - if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) { - RegEnc |= SDWA9EncValues::SRC_SGPR_MASK; + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + RegEnc |= MRI.getEncodingValue(Reg); + RegEnc &= SDWA9EncValues::SRC_VGPR_MASK; + if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) { + RegEnc |= SDWA9EncValues::SRC_SGPR_MASK; + } + return RegEnc; + } else { + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI); + if (Enc != ~0U && Enc != 255) { + return Enc | SDWA9EncValues::SRC_SGPR_MASK; + } } - return RegEnc; + + llvm_unreachable("Unsupported operand kind"); + return 0; } unsigned @@ -427,3 +438,6 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, llvm_unreachable("Encoding of this operand type is not supported yet."); return 0; } + +#define ENABLE_INSTR_PREDICATE_VERIFIER +#include "AMDGPUGenMCCodeEmitter.inc" diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td index 30a2df510386..1e0bc62c45a6 100644 --- a/lib/Target/AMDGPU/MIMGInstructions.td +++ b/lib/Target/AMDGPU/MIMGInstructions.td @@ -7,9 +7,63 @@ // //===----------------------------------------------------------------------===// -class MIMG_Mask <string op, int channels> { - string Op = op; - int Channels = channels; +// MIMG-specific encoding families to distinguish between semantically +// equivalent machine instructions with different encoding. +// +// - MIMGEncGfx6: encoding introduced with gfx6 (obsoleted for atomics in gfx8) +// - MIMGEncGfx8: encoding introduced with gfx8 for atomics +class MIMGEncoding; + +def MIMGEncGfx6 : MIMGEncoding; +def MIMGEncGfx8 : MIMGEncoding; + +def MIMGEncoding : GenericEnum { + let FilterClass = "MIMGEncoding"; +} + +// Represent an ISA-level opcode, independent of the encoding and the +// vdata/vaddr size. 
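A forward-looking aside on the GenericEnum, GenericTable and SearchIndex definitions introduced in this file (MIMGBaseOpcodesTable, MIMGDimInfoTable, MIMGInfoTable and getMIMGInfo below): TableGen's searchable-tables backend turns each of them into a constant array plus a lookup function named by PrimaryKeyName. A rough, self-contained model of the kind of code that results is sketched here; the struct layout and row values are invented purely for illustration:

    // Rough model of a generated searchable table: a sorted constant array
    // keyed by the primary key, plus a binary-search accessor.
    #include <algorithm>
    #include <cstdint>
    #include <iterator>

    struct MIMGInfoStub {            // invented layout, for illustration only
      uint16_t Opcode;               // primary key of the getMIMGInfo index
      uint8_t  BaseOpcode;
      uint8_t  VDataDwords;
      uint8_t  VAddrDwords;
    };

    static const MIMGInfoStub MIMGInfoRows[] = {  // kept sorted by Opcode
      { 100, 0, 1, 1 },
      { 101, 0, 1, 2 },
      { 102, 1, 2, 1 },
    };

    static const MIMGInfoStub *getMIMGInfoStub(uint16_t Opcode) {
      auto It = std::lower_bound(std::begin(MIMGInfoRows), std::end(MIMGInfoRows),
                                 Opcode, [](const MIMGInfoStub &E, uint16_t Op) {
                                   return E.Opcode < Op;
                                 });
      return (It != std::end(MIMGInfoRows) && It->Opcode == Opcode) ? &*It : nullptr;
    }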
+class MIMGBaseOpcode { + MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(NAME); + bit Store = 0; + bit Atomic = 0; + bit AtomicX2 = 0; // (f)cmpswap + bit Sampler = 0; + bits<8> NumExtraArgs = 0; + bit Gradients = 0; + bit Coordinates = 1; + bit LodOrClampOrMip = 0; + bit HasD16 = 0; +} + +def MIMGBaseOpcode : GenericEnum { + let FilterClass = "MIMGBaseOpcode"; +} + +def MIMGBaseOpcodesTable : GenericTable { + let FilterClass = "MIMGBaseOpcode"; + let CppTypeName = "MIMGBaseOpcodeInfo"; + let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", + "NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip", + "HasD16"]; + GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode; + + let PrimaryKey = ["BaseOpcode"]; + let PrimaryKeyName = "getMIMGBaseOpcodeInfo"; +} + +def MIMGDim : GenericEnum { + let FilterClass = "AMDGPUDimProps"; +} + +def MIMGDimInfoTable : GenericTable { + let FilterClass = "AMDGPUDimProps"; + let CppTypeName = "MIMGDimInfo"; + let Fields = ["Dim", "NumCoords", "NumGradients", "DA"]; + GenericEnum TypeOf_Dim = MIMGDim; + + let PrimaryKey = ["Dim"]; + let PrimaryKeyName = "getMIMGDimInfo"; } class mimg <bits<7> si, bits<7> vi = si> { @@ -17,254 +71,372 @@ class mimg <bits<7> si, bits<7> vi = si> { field bits<7> VI = vi; } -class MIMG_Helper <dag outs, dag ins, string asm, - string dns=""> : MIMG<outs, ins, asm,[]> { +class MIMG <dag outs, string dns = ""> + : InstSI <outs, (ins), "", []> { + + let VM_CNT = 1; + let EXP_CNT = 1; + let MIMG = 1; + let Uses = [EXEC]; let mayLoad = 1; let mayStore = 0; let hasPostISelHook = 1; + let SchedRW = [WriteVMEM]; + let UseNamedOperandTable = 1; + let hasSideEffects = 0; // XXX ???? + + let SubtargetPredicate = isGCN; let DecoderNamespace = dns; let isAsmParserOnly = !if(!eq(dns,""), 1, 0); let AsmMatchConverter = "cvtMIMG"; let usesCustomInserter = 1; - let SchedRW = [WriteVMEM]; + + Instruction Opcode = !cast<Instruction>(NAME); + MIMGBaseOpcode BaseOpcode; + MIMGEncoding MIMGEncoding = MIMGEncGfx6; + bits<8> VDataDwords; + bits<8> VAddrDwords; +} + +def MIMGInfoTable : GenericTable { + let FilterClass = "MIMG"; + let CppTypeName = "MIMGInfo"; + let Fields = ["Opcode", "BaseOpcode", "MIMGEncoding", "VDataDwords", "VAddrDwords"]; + GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode; + GenericEnum TypeOf_MIMGEncoding = MIMGEncoding; + + let PrimaryKey = ["BaseOpcode", "MIMGEncoding", "VDataDwords", "VAddrDwords"]; + let PrimaryKeyName = "getMIMGOpcodeHelper"; +} + +def getMIMGInfo : SearchIndex { + let Table = MIMGInfoTable; + let Key = ["Opcode"]; } class MIMG_NoSampler_Helper <bits<7> op, string asm, RegisterClass dst_rc, RegisterClass addr_rc, - string dns=""> : MIMG_Helper < - (outs dst_rc:$vdata), - (ins addr_rc:$vaddr, SReg_256:$srsrc, - dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", - dns>, MIMGe<op> { + string dns=""> + : MIMG <(outs dst_rc:$vdata), dns>, + MIMGe<op> { let ssamp = 0; + let d16 = !if(BaseOpcode.HasD16, ?, 0); + + let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, + DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" + #!if(BaseOpcode.HasD16, "$d16", ""); } multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - int channels> { - def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, 
VGPR_32, - !if(!eq(channels, 1), "AMDGPU", "")>, - MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>, - MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>, - MIMG_Mask<asm#"_V4", channels>; -} + RegisterClass dst_rc, + bit enableDisasm> { + let VAddrDwords = 1 in + def NAME # _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + let VAddrDwords = 2 in + def NAME # _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>; + let VAddrDwords = 3 in + def NAME # _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>; + let VAddrDwords = 4 in + def NAME # _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>; +} + +multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0, + bit isResInfo = 0> { + def "" : MIMGBaseOpcode { + let Coordinates = !if(isResInfo, 0, 1); + let LodOrClampOrMip = mip; + let HasD16 = has_d16; + } -multiclass MIMG_NoSampler <bits<7> op, string asm> { - defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>; - defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>; - defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>; - defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>; + let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), + mayLoad = !if(isResInfo, 0, 1) in { + let VDataDwords = 1 in + defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VGPR_32, 1>; + let VDataDwords = 2 in + defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 0>; + let VDataDwords = 3 in + defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>; + let VDataDwords = 4 in + defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>; + } } class MIMG_Store_Helper <bits<7> op, string asm, RegisterClass data_rc, RegisterClass addr_rc, - string dns = ""> : MIMG_Helper < - (outs), - (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, - dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", dns>, MIMGe<op> { + string dns = ""> + : MIMG <(outs), dns>, + MIMGe<op> { let ssamp = 0; - let mayLoad = 1; // TableGen requires this for matching with the intrinsics + let d16 = !if(BaseOpcode.HasD16, ?, 0); + + let mayLoad = 0; let mayStore = 1; - let hasSideEffects = 1; + let hasSideEffects = 0; let hasPostISelHook = 0; let DisableWQM = 1; + + let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, + DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" + #!if(BaseOpcode.HasD16, "$d16", ""); } multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm, RegisterClass data_rc, - int channels> { - def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32, - !if(!eq(channels, 1), "AMDGPU", "")>, - MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>, - MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>, - MIMG_Mask<asm#"_V4", channels>; -} + bit enableDisasm> { + let VAddrDwords = 1 in + def NAME # _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32, + !if(enableDisasm, "AMDGPU", "")>; + let VAddrDwords = 2 in + def NAME # _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>; + let VAddrDwords = 3 in + def NAME # _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>; + let VAddrDwords = 4 in + def 
NAME # _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>; +} + +multiclass MIMG_Store <bits<7> op, string asm, bit has_d16, bit mip = 0> { + def "" : MIMGBaseOpcode { + let Store = 1; + let LodOrClampOrMip = mip; + let HasD16 = has_d16; + } -multiclass MIMG_Store <bits<7> op, string asm> { - defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>; - defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 2>; - defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 3>; - defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 4>; + let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in { + let VDataDwords = 1 in + defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>; + let VDataDwords = 2 in + defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 0>; + let VDataDwords = 3 in + defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 0>; + let VDataDwords = 4 in + defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 0>; + } } class MIMG_Atomic_Helper <string asm, RegisterClass data_rc, - RegisterClass addr_rc> : MIMG_Helper < - (outs data_rc:$vdst), - (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, - dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" - > { + RegisterClass addr_rc, string dns="", + bit enableDasm = 0> + : MIMG <(outs data_rc:$vdst), !if(enableDasm, dns, "")> { + let mayLoad = 1; let mayStore = 1; - let hasSideEffects = 1; + let hasSideEffects = 1; // FIXME: Remove this let hasPostISelHook = 0; let DisableWQM = 1; let Constraints = "$vdst = $vdata"; let AsmMatchConverter = "cvtMIMGAtomic"; -} -class MIMG_Atomic_Real_si<mimg op, string name, string asm, - RegisterClass data_rc, RegisterClass addr_rc> : - MIMG_Atomic_Helper<asm, data_rc, addr_rc>, - SIMCInstr<name, SIEncodingFamily.SI>, - MIMGe<op.SI> { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isSICI]; - let DecoderNamespace = "SICI"; - let DisableDecoder = DisableSIDecoder; -} - -class MIMG_Atomic_Real_vi<mimg op, string name, string asm, - RegisterClass data_rc, RegisterClass addr_rc> : - MIMG_Atomic_Helper<asm, data_rc, addr_rc>, - SIMCInstr<name, SIEncodingFamily.VI>, - MIMGe<op.VI> { - let isCodeGenOnly = 0; - let AssemblerPredicates = [isVI]; - let DecoderNamespace = "VI"; - let DisableDecoder = DisableVIDecoder; -} - -multiclass MIMG_Atomic_Helper_m <mimg op, string name, string asm, - RegisterClass data_rc, RegisterClass addr_rc> { - let isPseudo = 1, isCodeGenOnly = 1 in { - def "" : MIMG_Atomic_Helper<asm, data_rc, addr_rc>, - SIMCInstr<name, SIEncodingFamily.NONE>; + let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, + DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da); + let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"; +} + +multiclass MIMG_Atomic_Helper_m <mimg op, string asm, RegisterClass data_rc, + RegisterClass addr_rc, bit enableDasm = 0> { + let ssamp = 0, d16 = 0 in { + def _si : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "SICI", enableDasm>, + SIMCInstr<NAME, SIEncodingFamily.SI>, + MIMGe<op.SI> { + let AssemblerPredicates = [isSICI]; + let DisableDecoder = DisableSIDecoder; + } + + def _vi : MIMG_Atomic_Helper<asm, data_rc, addr_rc, "VI", enableDasm>, + SIMCInstr<NAME, SIEncodingFamily.VI>, + MIMGe<op.VI> { + let AssemblerPredicates = [isVI]; + let DisableDecoder = DisableVIDecoder; + let MIMGEncoding = MIMGEncGfx8; + } } +} - let ssamp = 0 in { - def _si : MIMG_Atomic_Real_si<op, name, asm, data_rc, 
addr_rc>; +multiclass MIMG_Atomic_Addr_Helper_m <mimg op, string asm, + RegisterClass data_rc, + bit enableDasm = 0> { + // _V* variants have different address size, but the size is not encoded. + // So only one variant can be disassembled. V1 looks the safest to decode. + let VAddrDwords = 1 in + defm _V1 : MIMG_Atomic_Helper_m <op, asm, data_rc, VGPR_32, enableDasm>; + let VAddrDwords = 2 in + defm _V2 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_64>; + let VAddrDwords = 3 in + defm _V3 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_96>; + let VAddrDwords = 4 in + defm _V4 : MIMG_Atomic_Helper_m <op, asm, data_rc, VReg_128>; +} + +multiclass MIMG_Atomic <mimg op, string asm, bit isCmpSwap = 0> { // 64-bit atomics + def "" : MIMGBaseOpcode { + let Atomic = 1; + let AtomicX2 = isCmpSwap; + } - def _vi : MIMG_Atomic_Real_vi<op, name, asm, data_rc, addr_rc>; + let BaseOpcode = !cast<MIMGBaseOpcode>(NAME) in { + // _V* variants have different dst size, but the size is encoded implicitly, + // using dmask and tfe. Only 32-bit variant is registered with disassembler. + // Other variants are reconstructed by disassembler using dmask and tfe. + let VDataDwords = !if(isCmpSwap, 2, 1) in + defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1>; + let VDataDwords = !if(isCmpSwap, 4, 2) in + defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64)>; } } -multiclass MIMG_Atomic <mimg op, string asm, RegisterClass data_rc = VGPR_32> { - defm _V1 : MIMG_Atomic_Helper_m <op, asm # "_V1", asm, data_rc, VGPR_32>; - defm _V2 : MIMG_Atomic_Helper_m <op, asm # "_V2", asm, data_rc, VReg_64>; - defm _V4 : MIMG_Atomic_Helper_m <op, asm # "_V3", asm, data_rc, VReg_128>; +class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc, + RegisterClass src_rc, string dns=""> + : MIMG <(outs dst_rc:$vdata), dns>, + MIMGe<op> { + let d16 = !if(BaseOpcode.HasD16, ?, 0); + + let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, + R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da), + !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); + let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da" + #!if(BaseOpcode.HasD16, "$d16", ""); +} + +class MIMGAddrSize<int dw, bit enable_disasm> { + int NumWords = dw; + + RegisterClass RegClass = !if(!le(NumWords, 0), ?, + !if(!eq(NumWords, 1), VGPR_32, + !if(!eq(NumWords, 2), VReg_64, + !if(!eq(NumWords, 3), VReg_96, + !if(!eq(NumWords, 4), VReg_128, + !if(!le(NumWords, 8), VReg_256, + !if(!le(NumWords, 16), VReg_512, ?))))))); + + // Whether the instruction variant with this vaddr size should be enabled for + // the auto-generated disassembler. + bit Disassemble = enable_disasm; +} + +// Return whether a value inside the range [min, max] (endpoints inclusive) +// is in the given list. +class isRangeInList<int min, int max, list<int> lst> { + bit ret = !foldl(0, lst, lhs, y, !or(lhs, !and(!le(min, y), !le(y, max)))); +} + +class MIMGAddrSizes_tmp<list<MIMGAddrSize> lst, int min> { + list<MIMGAddrSize> List = lst; + int Min = min; +} + +class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample> { + // List of all possible numbers of address words, taking all combinations of + // A16 and image dimension into account (note: no MSAA, since this is for + // sample/gather ops). 
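A worked example for the address-size computation that follows, read off the lists and the !add below (with the usual sample modifiers assumed for the extra arguments): a plain sample with no gradients and no lod/clamp starts from 1, 2 or 3 address dwords for the coordinates alone; adding lod or clamp extends that to 1 through 4; the gradient forms start higher because the derivatives occupy additional dwords. Each count is then increased by the number of extra address arguments the AMDGPUSampleVariant carries (such as an offset, bias or depth-compare value), and the fold over [1, 2, 3, 4, 8, 16] rounds every resulting count up to a vector register class that actually exists, which is how the _V1 through _V16 machine-instruction variants come about.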
+ list<int> AllNumAddrWords = + !foreach(dw, !if(sample.Gradients, + !if(!eq(sample.LodOrClamp, ""), + [2, 3, 4, 5, 6, 7, 9], + [2, 3, 4, 5, 7, 8, 10]), + !if(!eq(sample.LodOrClamp, ""), + [1, 2, 3], + [1, 2, 3, 4])), + !add(dw, !size(sample.ExtraAddrArgs))); + + // Generate machine instructions based on possible register classes for the + // required numbers of address words. The disassembler defaults to the + // smallest register class. + list<MIMGAddrSize> MachineInstrs = + !foldl(MIMGAddrSizes_tmp<[], 0>, [1, 2, 3, 4, 8, 16], lhs, dw, + !if(isRangeInList<lhs.Min, dw, AllNumAddrWords>.ret, + MIMGAddrSizes_tmp< + !listconcat(lhs.List, [MIMGAddrSize<dw, !empty(lhs.List)>]), + !if(!eq(dw, 3), 3, !add(dw, 1))>, // we still need _V4 for codegen w/ 3 dwords + lhs)).List; } -class MIMG_Sampler_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - RegisterClass src_rc, - bit wqm, - string dns=""> : MIMG_Helper < - (outs dst_rc:$vdata), - (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, - dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", - dns>, MIMGe<op> { - let WQM = wqm; +multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, + AMDGPUSampleVariant sample, RegisterClass dst_rc, + bit enableDisasm = 0> { + foreach addr = MIMG_Sampler_AddrSizes<sample>.MachineInstrs in { + let VAddrDwords = addr.NumWords in + def _V # addr.NumWords + : MIMG_Sampler_Helper <op, asm, dst_rc, addr.RegClass, + !if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>; + } } -multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - int channels, bit wqm> { - def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm, - !if(!eq(channels, 1), "AMDGPU", "")>, - MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>, - MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128, wqm>, - MIMG_Mask<asm#"_V4", channels>; - def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256, wqm>, - MIMG_Mask<asm#"_V8", channels>; - def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512, wqm>, - MIMG_Mask<asm#"_V16", channels>; -} - -multiclass MIMG_Sampler <bits<7> op, string asm, bit wqm=0> { - defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, wqm>; - defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, wqm>; - defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, wqm>; - defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, wqm>; -} - -multiclass MIMG_Sampler_WQM <bits<7> op, string asm> : MIMG_Sampler<op, asm, 1>; - -class MIMG_Gather_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - RegisterClass src_rc, bit wqm> : MIMG < - (outs dst_rc:$vdata), - (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, - dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, - r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", - []>, MIMGe<op> { - let mayLoad = 1; - let mayStore = 0; +class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample> + : MIMGBaseOpcode { + let Sampler = 1; + let NumExtraArgs = !size(sample.ExtraAddrArgs); + let Gradients = sample.Gradients; + let LodOrClampOrMip = !ne(sample.LodOrClamp, ""); +} - // DMASK was repurposed for GATHER4. 4 components are always - // returned and DMASK works like a swizzle - it selects - // the component to fetch. The only useful DMASK values are - // 1=red, 2=green, 4=blue, 8=alpha. 
(e.g. 1 returns - // (red,red,red,red) etc.) The ISA document doesn't mention - // this. - // Therefore, disable all code which updates DMASK by setting this: - let Gather4 = 1; - let hasPostISelHook = 0; - let WQM = wqm; +multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, + bit isGetLod = 0, + string asm = "image_sample"#sample.LowerCaseMod> { + def "" : MIMG_Sampler_BaseOpcode<sample> { + let HasD16 = !if(isGetLod, 0, 1); + } - let isAsmParserOnly = 1; // TBD: fix it later + let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm, + mayLoad = !if(isGetLod, 0, 1) in { + let VDataDwords = 1 in + defm _V1 : MIMG_Sampler_Src_Helper<op, asm, sample, VGPR_32, 1>; + let VDataDwords = 2 in + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; + let VDataDwords = 3 in + defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>; + let VDataDwords = 4 in + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>; + } } -multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm, - RegisterClass dst_rc, - int channels, bit wqm> { - def _V1 : MIMG_Gather_Helper <op, asm, dst_rc, VGPR_32, wqm>, - MIMG_Mask<asm#"_V1", channels>; - def _V2 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_64, wqm>, - MIMG_Mask<asm#"_V2", channels>; - def _V4 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_128, wqm>, - MIMG_Mask<asm#"_V4", channels>; - def _V8 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_256, wqm>, - MIMG_Mask<asm#"_V8", channels>; - def _V16 : MIMG_Gather_Helper <op, asm, dst_rc, VReg_512, wqm>, - MIMG_Mask<asm#"_V16", channels>; -} +multiclass MIMG_Sampler_WQM <bits<7> op, AMDGPUSampleVariant sample> + : MIMG_Sampler<op, sample, 1>; -multiclass MIMG_Gather <bits<7> op, string asm, bit wqm=0> { - defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, wqm>; - defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, wqm>; - defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, wqm>; - defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, wqm>; +multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, + string asm = "image_gather4"#sample.LowerCaseMod> { + def "" : MIMG_Sampler_BaseOpcode<sample> { + let HasD16 = 1; + } + + let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm, + Gather4 = 1, hasPostISelHook = 0 in { + let VDataDwords = 2 in + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */ + let VDataDwords = 4 in + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>; + } } -multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>; +multiclass MIMG_Gather_WQM <bits<7> op, AMDGPUSampleVariant sample> + : MIMG_Gather<op, sample, 1>; //===----------------------------------------------------------------------===// // MIMG Instructions //===----------------------------------------------------------------------===// -let SubtargetPredicate = isGCN in { -defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load">; -defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; -//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"image_load_pck", 0x00000002>; -//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; -//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; -//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; -defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">; -defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">; -//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 
0x0000000a>; -//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; - -let mayLoad = 0, mayStore = 0 in { -defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; -} +defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "image_load", 1>; +defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip", 1, 1>; +defm IMAGE_LOAD_PCK : MIMG_NoSampler <0x00000002, "image_load_pck", 0>; +defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <0x00000003, "image_load_pck_sgn", 0>; +defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <0x00000004, "image_load_mip_pck", 0, 1>; +defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <0x00000005, "image_load_mip_pck_sgn", 0, 1>; +defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store", 1>; +defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip", 1, 1>; +defm IMAGE_STORE_PCK : MIMG_Store <0x0000000a, "image_store_pck", 0>; +defm IMAGE_STORE_MIP_PCK : MIMG_Store <0x0000000b, "image_store_mip_pck", 0, 1>; + +defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo", 0, 1, 1>; defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">; -defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>; +defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", 1>; defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">; defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">; //def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI @@ -277,397 +449,101 @@ defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">; defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">; defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">; defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">; -//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI +//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d, 1>; -- not on VI //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI -defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">; -defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">; -defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; -defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, "image_sample_d_cl">; -defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "image_sample_l">; -defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, "image_sample_b">; -defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, "image_sample_b_cl">; -defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, "image_sample_lz">; -defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, "image_sample_c">; -defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, "image_sample_c_cl">; -defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "image_sample_c_d">; -defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, "image_sample_c_d_cl">; -defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "image_sample_c_l">; -defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, "image_sample_c_b">; -defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, "image_sample_c_b_cl">; -defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, "image_sample_c_lz">; -defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, "image_sample_o">; -defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, 
"image_sample_cl_o">; -defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, "image_sample_d_o">; -defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, "image_sample_d_cl_o">; -defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, "image_sample_l_o">; -defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, "image_sample_b_o">; -defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, "image_sample_b_cl_o">; -defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, "image_sample_lz_o">; -defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, "image_sample_c_o">; -defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, "image_sample_c_cl_o">; -defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, "image_sample_c_d_o">; -defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, "image_sample_c_d_cl_o">; -defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, "image_sample_c_l_o">; -defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, "image_sample_c_b_o">; -defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, "image_sample_c_b_cl_o">; -defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, "image_sample_c_lz_o">; -defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, "image_gather4">; -defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, "image_gather4_cl">; -defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, "image_gather4_l">; -defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, "image_gather4_b">; -defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, "image_gather4_b_cl">; -defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, "image_gather4_lz">; -defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, "image_gather4_c">; -defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, "image_gather4_c_cl">; -defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, "image_gather4_c_l">; -defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, "image_gather4_c_b">; -defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, "image_gather4_c_b_cl">; -defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, "image_gather4_c_lz">; -defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, "image_gather4_o">; -defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, "image_gather4_cl_o">; -defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, "image_gather4_l_o">; -defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, "image_gather4_b_o">; -defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, "image_gather4_b_cl_o">; -defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, "image_gather4_lz_o">; -defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, "image_gather4_c_o">; -defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, "image_gather4_c_cl_o">; -defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; -defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">; -defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">; -defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; - -let mayLoad = 0, mayStore = 0 in { -defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">; -} - -defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; -defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">; -defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; -defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, "image_sample_c_cd_cl">; -defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, "image_sample_cd_o">; -defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, "image_sample_cd_cl_o">; -defm IMAGE_SAMPLE_C_CD_O : 
MIMG_Sampler <0x0000006e, "image_sample_c_cd_o">; -defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o">; +defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>; +defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>; +defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>; +defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>; +defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>; +defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>; +defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>; +defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>; +defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>; +defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>; +defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>; +defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>; +defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>; +defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>; +defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>; +defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>; +defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>; +defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>; +defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>; +defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>; +defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>; +defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>; +defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>; +defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>; +defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>; +defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>; +defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>; +defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>; +defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>; +defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>; +defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>; +defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>; +defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>; +defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>; +defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>; +defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>; +defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>; +defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>; +defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>; +defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>; +defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>; +defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>; +defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>; +defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>; +defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>; +defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>; +defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, 
AMDGPUSample_l_o>; +defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>; +defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>; +defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>; +defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>; +defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>; +defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>; +defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>; +defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>; +defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>; + +defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 1, "image_get_lod">; + +defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>; +defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>; +defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>; +defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>; +defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>; +defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>; +defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>; +defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>; //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; //def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; + +/********** ========================================= **********/ +/********** Table of dimension-aware image intrinsics **********/ +/********** ========================================= **********/ + +class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> { + Intrinsic Intr = I; + MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(!strconcat("IMAGE_", I.P.OpMod)); + AMDGPUDimProps Dim = I.P.Dim; } -/********** ======================= **********/ -/********** Image sampling patterns **********/ -/********** ======================= **********/ - -// Image + sampler -class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, - i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), - (opcode $addr, $rsrc, $sampler, - (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) ->; - -multiclass SampleRawPatterns<SDPatternOperator name, string opcode> { - def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; - def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; - def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; - def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V8), v8i32>; - def : SampleRawPattern<name, !cast<MIMG>(opcode # _V4_V16), v16i32>; -} - -// Image + sampler for amdgcn -// TODO: -// 1. Handle half data type like v4f16, and add D16 bit support; -// 2. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128). -// 3. Add A16 support when we pass address of half type. 
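
The ImageDimIntrinsicInfo records introduced above associate each dimension-aware image intrinsic with a MIMG base opcode and a dimension; a GenericTable defined further down in this diff (ImageDimIntrinsicTable, keyed on Intr with PrimaryKeyName "getImageDimIntrinsicInfo") turns those records into a lookup the backend can query. As a rough, non-authoritative illustration of that kind of primary-keyed table, here is a stand-alone C++ sketch; the struct, field, and function names are invented for the sketch and are not the code TableGen actually generates:

// Illustrative only: a hand-rolled analogue of a primary-keyed record table.
#include <algorithm>
#include <iterator>

struct ImageDimIntrinsicInfoSketch {
  unsigned IntrinsicID;   // primary key, kept sorted
  unsigned BaseOpcode;    // stands in for a MIMGBaseOpcode enum value
  unsigned Dim;           // stands in for a MIMGDim enum value
};

// Entries are made up; a generated table would be sorted by the primary key.
static const ImageDimIntrinsicInfoSketch Table[] = {
  {100, 7, 1}, {101, 7, 2}, {205, 12, 1},
};

const ImageDimIntrinsicInfoSketch *lookupByIntrinsic(unsigned ID) {
  auto It = std::lower_bound(std::begin(Table), std::end(Table), ID,
      [](const ImageDimIntrinsicInfoSketch &R, unsigned Key) {
        return R.IntrinsicID < Key;
      });
  // PrimaryKeyEarlyOut-style behaviour: return nothing when the key is absent.
  if (It == std::end(Table) || It->IntrinsicID != ID)
    return nullptr;
  return It;
}
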
-multiclass AMDGCNSamplePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> { - def : GCNPat< - (dt (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc, - i1:$slc, i1:$lwe, i1:$da)), - (opcode $addr, $rsrc, $sampler, - (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), - 0, 0, (as_i1imm $lwe), (as_i1imm $da)) - >; -} - -multiclass AMDGCNSampleDataPatterns<SDPatternOperator name, string opcode, ValueType dt> { - defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V1), dt, f32>; - defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V2), dt, v2f32>; - defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V4), dt, v4f32>; - defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V8), dt, v8f32>; - defm : AMDGCNSamplePattern<name, !cast<MIMG>(opcode # _V16), dt, v16f32>; -} - -// TODO: support v3f32. -multiclass AMDGCNSamplePatterns<SDPatternOperator name, string opcode> { - defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V1), f32>; - defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V2), v2f32>; - defm : AMDGCNSampleDataPatterns<name, !cast<string>(opcode # _V4), v4f32>; -} - -// Image only -class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat < - (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm, - imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe), - (opcode $addr, $rsrc, - (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) ->; - -multiclass ImagePatterns<SDPatternOperator name, string opcode> { - def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; - def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; - def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; -} - -multiclass ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> { - def : GCNPat < - (dt (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe, - i1:$da)), - (opcode $addr, $rsrc, - (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), - 0, 0, (as_i1imm $lwe), (as_i1imm $da)) - >; -} - -multiclass ImageLoadDataPatterns<SDPatternOperator name, string opcode, ValueType dt> { - defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V1), dt, i32>; - defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>; - defm : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>; -} - -// TODO: support v3f32. -multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> { - defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f32>; - defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2f32>; - defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V4), v4f32>; -} - -multiclass ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> { - def : GCNPat < - (name dt:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, - i1:$lwe, i1:$da), - (opcode $data, $addr, $rsrc, - (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), - 0, 0, (as_i1imm $lwe), (as_i1imm $da)) - >; -} - -multiclass ImageStoreDataPatterns<SDPatternOperator name, string opcode, ValueType dt> { - defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V1), dt, i32>; - defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V2), dt, v2i32>; - defm : ImageStorePattern<name, !cast<MIMG>(opcode # _V4), dt, v4i32>; -} - -// TODO: support v3f32. 
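
The removed data-pattern helpers above choose a _V1/_V2/_V4 instruction variant from the result type, which is really a statement about how many dwords of image data the MIMG instruction moves; the new multiclasses express the same thing through VDataDwords. For ordinary loads and samples that count follows the number of channels enabled in DMASK, while gather4 is the exception described in the removed comment near the top of this hunk (it always returns four values), and packed D16 halves the dword count again. A small hedged sketch of that accounting, not the backend's actual selection logic:

#include <bitset>
#include <cstdint>

// Roughly how many VGPR dwords an image load/sample result occupies.
unsigned vdataDwords(uint8_t dmask, bool isGather4, bool packedD16) {
  // gather4 fetches one component from each of four texels, so it always
  // produces four values regardless of dmask.
  unsigned Channels = isGather4
                          ? 4u
                          : unsigned(std::bitset<4>(dmask & 0xf).count());
  if (packedD16)                        // two 16-bit components per dword
    Channels = (Channels + 1) / 2;
  return Channels;                      // TFE/LWE extra dword ignored here
}
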
-multiclass ImageStorePatterns<SDPatternOperator name, string opcode> { - defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V1), f32>; - defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V2), v2f32>; - defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V4), v4f32>; -} - -class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat < - (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), - (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)) ->; - -multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> { - def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V1), i32>; - def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V2), v2i32>; - def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>; -} - -class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : GCNPat < - (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc, - imm:$r128, imm:$da, imm:$slc), - (EXTRACT_SUBREG - (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1), - $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)), - sub0) ->; - -// ======= amdgcn Image Intrinsics ============== - -// Image load -defm : ImageLoadPatterns<int_amdgcn_image_load, "IMAGE_LOAD">; -defm : ImageLoadPatterns<int_amdgcn_image_load_mip, "IMAGE_LOAD_MIP">; -defm : ImageLoadPatterns<int_amdgcn_image_getresinfo, "IMAGE_GET_RESINFO">; - -// Image store -defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">; -defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">; - -// Basic sample -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample, "IMAGE_SAMPLE">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl, "IMAGE_SAMPLE_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d, "IMAGE_SAMPLE_D">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l, "IMAGE_SAMPLE_L">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b, "IMAGE_SAMPLE_B">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz, "IMAGE_SAMPLE_LZ">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd, "IMAGE_SAMPLE_CD">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">; - -// Sample with comparison -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c, "IMAGE_SAMPLE_C">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d, "IMAGE_SAMPLE_C_D">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l, "IMAGE_SAMPLE_C_L">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b, "IMAGE_SAMPLE_C_B">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">; - -// Sample with offsets -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_o, "IMAGE_SAMPLE_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_d_o, "IMAGE_SAMPLE_D_O">; -defm : 
AMDGCNSamplePatterns<int_amdgcn_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_l_o, "IMAGE_SAMPLE_L_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_o, "IMAGE_SAMPLE_B_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">; - -// Sample with comparison and offsets -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_o, "IMAGE_SAMPLE_C_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">; - -// Gather opcodes -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4, "IMAGE_GATHER4">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl, "IMAGE_GATHER4_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l, "IMAGE_GATHER4_L">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b, "IMAGE_GATHER4_B">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl, "IMAGE_GATHER4_B_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz, "IMAGE_GATHER4_LZ">; - -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c, "IMAGE_GATHER4_C">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl, "IMAGE_GATHER4_C_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l, "IMAGE_GATHER4_C_L">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b, "IMAGE_GATHER4_C_B">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl, "IMAGE_GATHER4_C_B_CL">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz, "IMAGE_GATHER4_C_LZ">; - -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_o, "IMAGE_GATHER4_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_cl_o, "IMAGE_GATHER4_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_l_o, "IMAGE_GATHER4_L_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_o, "IMAGE_GATHER4_B_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_b_cl_o, "IMAGE_GATHER4_B_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_lz_o, "IMAGE_GATHER4_LZ_O">; - -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_o, "IMAGE_GATHER4_C_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_cl_o, "IMAGE_GATHER4_C_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_l_o, "IMAGE_GATHER4_C_L_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_o, "IMAGE_GATHER4_C_B_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_b_cl_o, "IMAGE_GATHER4_C_B_CL_O">; -defm : AMDGCNSamplePatterns<int_amdgcn_image_gather4_c_lz_o, "IMAGE_GATHER4_C_LZ_O">; - -defm : AMDGCNSamplePatterns<int_amdgcn_image_getlod, "IMAGE_GET_LOD">; - -// Image atomics 
-defm : ImageAtomicPatterns<int_amdgcn_image_atomic_swap, "IMAGE_ATOMIC_SWAP">; -def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V1, i32>; -def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V2, v2i32>; -def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V4, v4i32>; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_add, "IMAGE_ATOMIC_ADD">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_sub, "IMAGE_ATOMIC_SUB">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smin, "IMAGE_ATOMIC_SMIN">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umin, "IMAGE_ATOMIC_UMIN">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smax, "IMAGE_ATOMIC_SMAX">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umax, "IMAGE_ATOMIC_UMAX">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_and, "IMAGE_ATOMIC_AND">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_or, "IMAGE_ATOMIC_OR">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_xor, "IMAGE_ATOMIC_XOR">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">; -defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">; - -/* SIsample for simple 1D texture lookup */ -def : GCNPat < - (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm), - (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) ->; - -class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) ->; - -class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT), - (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0) ->; - -class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) ->; - -class SampleShadowPattern<SDNode name, MIMG opcode, - ValueType vt> : GCNPat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) ->; - -class SampleShadowArrayPattern<SDNode name, MIMG opcode, - ValueType vt> : GCNPat < - (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), - (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) ->; - -/* SIsample* for texture lookups consuming more address parameters */ -multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l, - MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b, -MIMG sample_d, MIMG sample_c_d, ValueType addr_type> { - def : SamplePattern <SIsample, sample, addr_type>; - def : SampleRectPattern <SIsample, sample, addr_type>; - def : SampleArrayPattern <SIsample, sample, addr_type>; - def : SampleShadowPattern <SIsample, sample_c, addr_type>; - def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>; - - def : SamplePattern <SIsamplel, sample_l, addr_type>; - def : SampleArrayPattern <SIsamplel, sample_l, addr_type>; - def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>; - def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>; - - def : SamplePattern <SIsampleb, sample_b, addr_type>; - def : SampleArrayPattern <SIsampleb, sample_b, addr_type>; - def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>; - def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>; - - def : SamplePattern <SIsampled, sample_d, addr_type>; - def : SampleArrayPattern <SIsampled, sample_d, addr_type>; - def : SampleShadowPattern 
<SIsampled, sample_c_d, addr_type>; - def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>; -} - -defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2, - IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2, - IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2, - IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2, - v2i32>; -defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4, - IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4, - IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4, - IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4, - v4i32>; -defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8, - IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8, - IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8, - IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8, - v8i32>; -defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16, - IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16, - IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16, - IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16, - v16i32>; +def ImageDimIntrinsicTable : GenericTable { + let FilterClass = "ImageDimIntrinsicInfo"; + let Fields = ["Intr", "BaseOpcode", "Dim"]; + GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode; + GenericEnum TypeOf_Dim = MIMGDim; + + let PrimaryKey = ["Intr"]; + let PrimaryKeyName = "getImageDimIntrinsicInfo"; + let PrimaryKeyEarlyOut = 1; +} + +foreach intr = !listconcat(AMDGPUImageDimIntrinsics, + AMDGPUImageDimAtomicIntrinsics) in { + def : ImageDimIntrinsicInfo<intr>; +} diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td deleted file mode 100644 index d50dae78e247..000000000000 --- a/lib/Target/AMDGPU/Processors.td +++ /dev/null @@ -1,12 +0,0 @@ -//===-- Processors.td - AMDGPU Processor definitions ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -FIXME: Deleting this file broke buildbots that don't do full rebuilds. This -file is no longer used by the backend, so it can be deleted once all -the buildbots update there dependencies. diff --git a/lib/Target/AMDGPU/R600.td b/lib/Target/AMDGPU/R600.td new file mode 100644 index 000000000000..5c9c1c1ed504 --- /dev/null +++ b/lib/Target/AMDGPU/R600.td @@ -0,0 +1,54 @@ +//===-- R600.td - R600 Tablegen files ----------------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +def R600InstrInfo : InstrInfo { + let guessInstructionProperties = 1; + let noNamedPositionallyEncodedOperands = 1; +} + +def R600 : Target { + let InstructionSet = R600InstrInfo; + let AllowRegisterRenaming = 1; +} + +let Namespace = "R600" in { + +foreach Index = 0-15 in { + def sub#Index : SubRegIndex<32, !shl(Index, 5)>; +} + +include "R600RegisterInfo.td" + +} + +def NullALU : InstrItinClass; +def ALU_NULL : FuncUnit; + +include "AMDGPUFeatures.td" +include "R600Schedule.td" +include "R600Processors.td" +include "AMDGPUInstrInfo.td" +include "AMDGPUInstructions.td" +include "R600Instructions.td" +include "R700Instructions.td" +include "EvergreenInstructions.td" +include "CaymanInstructions.td" + +// Calling convention for R600 +def CC_R600 : CallingConv<[ + CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[ + T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW, + T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW, + T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW, + T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW, + T30_XYZW, T31_XYZW, T32_XYZW + ]>>> +]>; diff --git a/lib/Target/AMDGPU/R600AsmPrinter.cpp b/lib/Target/AMDGPU/R600AsmPrinter.cpp new file mode 100644 index 000000000000..68f8c30775b8 --- /dev/null +++ b/lib/Target/AMDGPU/R600AsmPrinter.cpp @@ -0,0 +1,133 @@ +//===-- R600AsmPrinter.cpp - R600 Assebly printer ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// The R600AsmPrinter is used to print both assembly string and also binary +/// code. When passed an MCAsmStreamer it prints assembly and when passed +/// an MCObjectStreamer it outputs binary code. 
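
The EmitProgramInfoR600 routine that follows writes the shader's configuration into the .AMDGPU.config section as a flat stream of 32-bit (register, value) pairs: a resource register chosen from the calling convention together with the GPR count and CF stack size, the DB_SHADER_CONTROL kill bit, and, for compute entry points, the LDS allocation. A minimal stand-alone sketch of that encoding follows; byte order and the helper names are assumptions made purely for illustration, the real emission goes through MCStreamer::EmitIntValue as shown below:

#include <cstdint>
#include <utility>
#include <vector>

static void emitU32(std::vector<uint8_t> &Out, uint32_t V) {
  for (int I = 0; I < 4; ++I)          // little-endian assumed for the sketch
    Out.push_back(uint8_t(V >> (8 * I)));
}

// Each pair is (config register, value), e.g. the RsrcReg paired with
// S_NUM_GPRS(MaxGPR + 1) | S_STACK_SIZE(CFStackSize), or
// R_02880C_DB_SHADER_CONTROL paired with S_02880C_KILL_ENABLE(killPixel),
// using the macros visible in the code below.
std::vector<uint8_t>
encodeConfig(const std::vector<std::pair<uint32_t, uint32_t>> &Pairs) {
  std::vector<uint8_t> Out;
  for (const auto &P : Pairs) {
    emitU32(Out, P.first);
    emitU32(Out, P.second);
  }
  return Out;
}
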
+// +//===----------------------------------------------------------------------===// + +#include "R600AsmPrinter.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600MachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +using namespace llvm; + +AsmPrinter * +llvm::createR600AsmPrinterPass(TargetMachine &TM, + std::unique_ptr<MCStreamer> &&Streamer) { + return new R600AsmPrinter(TM, std::move(Streamer)); +} + +R600AsmPrinter::R600AsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)) { } + +StringRef R600AsmPrinter::getPassName() const { + return "R600 Assembly Printer"; +} + +void R600AsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { + unsigned MaxGPR = 0; + bool killPixel = false; + const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>(); + const R600RegisterInfo *RI = STM.getRegisterInfo(); + const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + if (MI.getOpcode() == R600::KILLGT) + killPixel = true; + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { + const MachineOperand &MO = MI.getOperand(op_idx); + if (!MO.isReg()) + continue; + unsigned HWReg = RI->getHWRegIndex(MO.getReg()); + + // Register with value > 127 aren't GPR + if (HWReg > 127) + continue; + MaxGPR = std::max(MaxGPR, HWReg); + } + } + } + + unsigned RsrcReg; + if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) { + // Evergreen / Northern Islands + switch (MF.getFunction().getCallingConv()) { + default: LLVM_FALLTHROUGH; + case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; + case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; + case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; + case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; + } + } else { + // R600 / R700 + switch (MF.getFunction().getCallingConv()) { + default: LLVM_FALLTHROUGH; + case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH; + case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH; + case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; + case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; + } + } + + OutStreamer->EmitIntValue(RsrcReg, 4); + OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | + S_STACK_SIZE(MFI->CFStackSize), 4); + OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); + OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); + + if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { + OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); + OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4); + } +} + +bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) { + + + // Functions needs to be cacheline (256B) aligned. 
+ MF.ensureAlignment(8); + + SetupMachineFunction(MF); + + MCContext &Context = getObjFileLowering().getContext(); + MCSectionELF *ConfigSection = + Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(ConfigSection); + + EmitProgramInfoR600(MF); + + EmitFunctionBody(); + + if (isVerbose()) { + MCSectionELF *CommentSection = + Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); + OutStreamer->SwitchSection(CommentSection); + + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + OutStreamer->emitRawComment( + Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize))); + } + + return false; +} + diff --git a/lib/Target/AMDGPU/R600AsmPrinter.h b/lib/Target/AMDGPU/R600AsmPrinter.h new file mode 100644 index 000000000000..079fc707b03c --- /dev/null +++ b/lib/Target/AMDGPU/R600AsmPrinter.h @@ -0,0 +1,46 @@ +//===-- R600AsmPrinter.h - Print R600 assembly code -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// R600 Assembly printer class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_R600ASMPRINTER_H +#define LLVM_LIB_TARGET_AMDGPU_R600ASMPRINTER_H + +#include "llvm/CodeGen/AsmPrinter.h" + +namespace llvm { + +class R600AsmPrinter final : public AsmPrinter { + +public: + explicit R600AsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer); + StringRef getPassName() const override; + bool runOnMachineFunction(MachineFunction &MF) override; + /// Implemented in AMDGPUMCInstLower.cpp + void EmitInstruction(const MachineInstr *MI) override; + /// Lower the specified LLVM Constant to an MCExpr. + /// The AsmPrinter::lowerConstantof does not know how to lower + /// addrspacecast, therefore they should be lowered by this function. 
+ const MCExpr *lowerConstant(const Constant *CV) override; + +private: + void EmitProgramInfoR600(const MachineFunction &MF); +}; + +AsmPrinter * +createR600AsmPrinterPass(TargetMachine &TM, + std::unique_ptr<MCStreamer> &&Streamer); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_R600ASMPRINTER_H diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp index 5e1ba6b506da..0c62d6a4b3d9 100644 --- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp +++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -19,6 +19,7 @@ #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -33,8 +34,8 @@ namespace { static bool isCFAlu(const MachineInstr &MI) { switch (MI.getOpcode()) { - case AMDGPU::CF_ALU: - case AMDGPU::CF_ALU_PUSH_BEFORE: + case R600::CF_ALU: + case R600::CF_ALU_PUSH_BEFORE: return true; default: return false; @@ -84,20 +85,20 @@ char &llvm::R600ClauseMergePassID = R600ClauseMergePass::ID; unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr &MI) const { assert(isCFAlu(MI)); return MI - .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::COUNT)) + .getOperand(TII->getOperandIdx(MI.getOpcode(), R600::OpName::COUNT)) .getImm(); } bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr &MI) const { assert(isCFAlu(MI)); return MI - .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::Enabled)) + .getOperand(TII->getOperandIdx(MI.getOpcode(), R600::OpName::Enabled)) .getImm(); } void R600ClauseMergePass::cleanPotentialDisabledCFAlu( MachineInstr &CFAlu) const { - int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + int CntIdx = TII->getOperandIdx(R600::CF_ALU, R600::OpName::COUNT); MachineBasicBlock::iterator I = CFAlu, E = CFAlu.getParent()->end(); I++; do { @@ -116,46 +117,46 @@ void R600ClauseMergePass::cleanPotentialDisabledCFAlu( bool R600ClauseMergePass::mergeIfPossible(MachineInstr &RootCFAlu, const MachineInstr &LatrCFAlu) const { assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu)); - int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + int CntIdx = TII->getOperandIdx(R600::CF_ALU, R600::OpName::COUNT); unsigned RootInstCount = getCFAluSize(RootCFAlu), LaterInstCount = getCFAluSize(LatrCFAlu); unsigned CumuledInsts = RootInstCount + LaterInstCount; if (CumuledInsts >= TII->getMaxAlusPerClause()) { - DEBUG(dbgs() << "Excess inst counts\n"); + LLVM_DEBUG(dbgs() << "Excess inst counts\n"); return false; } - if (RootCFAlu.getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + if (RootCFAlu.getOpcode() == R600::CF_ALU_PUSH_BEFORE) return false; // Is KCache Bank 0 compatible ? 
int Mode0Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_MODE0); int KBank0Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_BANK0); int KBank0LineIdx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_ADDR0); if (LatrCFAlu.getOperand(Mode0Idx).getImm() && RootCFAlu.getOperand(Mode0Idx).getImm() && (LatrCFAlu.getOperand(KBank0Idx).getImm() != RootCFAlu.getOperand(KBank0Idx).getImm() || LatrCFAlu.getOperand(KBank0LineIdx).getImm() != RootCFAlu.getOperand(KBank0LineIdx).getImm())) { - DEBUG(dbgs() << "Wrong KC0\n"); + LLVM_DEBUG(dbgs() << "Wrong KC0\n"); return false; } // Is KCache Bank 1 compatible ? int Mode1Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_MODE1); int KBank1Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_BANK1); int KBank1LineIdx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_ADDR1); if (LatrCFAlu.getOperand(Mode1Idx).getImm() && RootCFAlu.getOperand(Mode1Idx).getImm() && (LatrCFAlu.getOperand(KBank1Idx).getImm() != RootCFAlu.getOperand(KBank1Idx).getImm() || LatrCFAlu.getOperand(KBank1LineIdx).getImm() != RootCFAlu.getOperand(KBank1LineIdx).getImm())) { - DEBUG(dbgs() << "Wrong KC0\n"); + LLVM_DEBUG(dbgs() << "Wrong KC0\n"); return false; } if (LatrCFAlu.getOperand(Mode0Idx).getImm()) { diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 0e788df1c9c0..a19020276f35 100644 --- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -19,6 +19,7 @@ #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -93,7 +94,7 @@ bool CFStack::branchStackContains(CFStack::StackItem Item) { } bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { - if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() && + if (Opcode == R600::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() && getLoopDepth() > 1) return true; @@ -102,10 +103,10 @@ bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { switch(Opcode) { default: return false; - case AMDGPU::CF_ALU_PUSH_BEFORE: - case AMDGPU::CF_ALU_ELSE_AFTER: - case AMDGPU::CF_ALU_BREAK: - case AMDGPU::CF_ALU_CONTINUE: + case R600::CF_ALU_PUSH_BEFORE: + case R600::CF_ALU_ELSE_AFTER: + case R600::CF_ALU_BREAK: + case R600::CF_ALU_CONTINUE: if (CurrentSubEntries == 0) return false; if (ST->getWavefrontSize() == 64) { @@ -136,7 +137,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { return 0; case CFStack::FIRST_NON_WQM_PUSH: assert(!ST->hasCaymanISA()); - if (ST->getGeneration() <= R600Subtarget::R700) { + if (ST->getGeneration() <= AMDGPUSubtarget::R700) { // +1 For the push operation. // +2 Extra space required. 
return 3; @@ -149,7 +150,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { return 2; } case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: - assert(ST->getGeneration() >= R600Subtarget::EVERGREEN); + assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); // +1 For the push operation. // +1 Extra space required. return 2; @@ -167,8 +168,8 @@ void CFStack::updateMaxStackSize() { void CFStack::pushBranch(unsigned Opcode, bool isWQM) { CFStack::StackItem Item = CFStack::ENTRY; switch(Opcode) { - case AMDGPU::CF_PUSH_EG: - case AMDGPU::CF_ALU_PUSH_BEFORE: + case R600::CF_PUSH_EG: + case R600::CF_ALU_PUSH_BEFORE: if (!isWQM) { if (!ST->hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) @@ -176,7 +177,7 @@ void CFStack::pushBranch(unsigned Opcode, bool isWQM) { // See comment in // CFStack::getSubEntrySize() else if (CurrentEntries > 0 && - ST->getGeneration() > R600Subtarget::EVERGREEN && + ST->getGeneration() > AMDGPUSubtarget::EVERGREEN && !ST->hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; @@ -239,8 +240,8 @@ private: bool IsTrivialInst(MachineInstr &MI) const { switch (MI.getOpcode()) { - case AMDGPU::KILL: - case AMDGPU::RETURN: + case R600::KILL: + case R600::RETURN: return true; default: return false; @@ -249,44 +250,44 @@ private: const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { unsigned Opcode = 0; - bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN); + bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); switch (CFI) { case CF_TC: - Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; + Opcode = isEg ? R600::CF_TC_EG : R600::CF_TC_R600; break; case CF_VC: - Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600; + Opcode = isEg ? R600::CF_VC_EG : R600::CF_VC_R600; break; case CF_CALL_FS: - Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600; + Opcode = isEg ? R600::CF_CALL_FS_EG : R600::CF_CALL_FS_R600; break; case CF_WHILE_LOOP: - Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600; + Opcode = isEg ? R600::WHILE_LOOP_EG : R600::WHILE_LOOP_R600; break; case CF_END_LOOP: - Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600; + Opcode = isEg ? R600::END_LOOP_EG : R600::END_LOOP_R600; break; case CF_LOOP_BREAK: - Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600; + Opcode = isEg ? R600::LOOP_BREAK_EG : R600::LOOP_BREAK_R600; break; case CF_LOOP_CONTINUE: - Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600; + Opcode = isEg ? R600::CF_CONTINUE_EG : R600::CF_CONTINUE_R600; break; case CF_JUMP: - Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600; + Opcode = isEg ? R600::CF_JUMP_EG : R600::CF_JUMP_R600; break; case CF_ELSE: - Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600; + Opcode = isEg ? R600::CF_ELSE_EG : R600::CF_ELSE_R600; break; case CF_POP: - Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600; + Opcode = isEg ? R600::POP_EG : R600::POP_R600; break; case CF_END: if (ST->hasCaymanISA()) { - Opcode = AMDGPU::CF_END_CM; + Opcode = R600::CF_END_CM; break; } - Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600; + Opcode = isEg ? 
R600::CF_END_EG : R600::CF_END_R600; break; } assert (Opcode && "No opcode selected"); @@ -304,21 +305,21 @@ private: continue; if (MO.isDef()) { unsigned Reg = MO.getReg(); - if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + if (R600::R600_Reg128RegClass.contains(Reg)) DstMI = Reg; else DstMI = TRI->getMatchingSuperReg(Reg, - TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), - &AMDGPU::R600_Reg128RegClass); + AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)), + &R600::R600_Reg128RegClass); } if (MO.isUse()) { unsigned Reg = MO.getReg(); - if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + if (R600::R600_Reg128RegClass.contains(Reg)) SrcMI = Reg; else SrcMI = TRI->getMatchingSuperReg(Reg, - TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), - &AMDGPU::R600_Reg128RegClass); + AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)), + &R600::R600_Reg128RegClass); } } if ((DstRegs.find(SrcMI) == DstRegs.end())) { @@ -358,15 +359,15 @@ private: void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const { static const unsigned LiteralRegs[] = { - AMDGPU::ALU_LITERAL_X, - AMDGPU::ALU_LITERAL_Y, - AMDGPU::ALU_LITERAL_Z, - AMDGPU::ALU_LITERAL_W + R600::ALU_LITERAL_X, + R600::ALU_LITERAL_Y, + R600::ALU_LITERAL_Z, + R600::ALU_LITERAL_W }; const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs = TII->getSrcs(MI); for (const auto &Src:Srcs) { - if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X) + if (Src.first->getReg() != R600::ALU_LITERAL_X) continue; int64_t Imm = Src.second; std::vector<MachineOperand *>::iterator It = @@ -376,7 +377,7 @@ private: // Get corresponding Operand MachineOperand &Operand = MI.getOperand( - TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal)); + TII->getOperandIdx(MI.getOpcode(), R600::OpName::literal)); if (It != Lits.end()) { // Reuse existing literal reg @@ -399,7 +400,7 @@ private: unsigned LiteralPair0 = Literals[i]; unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0; InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(), - TII->get(AMDGPU::LITERALS)) + TII->get(R600::LITERALS)) .addImm(LiteralPair0) .addImm(LiteralPair1); } @@ -441,7 +442,7 @@ private: } for (unsigned i = 0, e = Literals.size(); i < e; i += 2) { MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(), - TII->get(AMDGPU::LITERALS)); + TII->get(R600::LITERALS)); if (Literals[i]->isImm()) { MILit.addImm(Literals[i]->getImm()); } else { @@ -470,7 +471,7 @@ private: unsigned &CfCount) { CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); - BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount); + BuildMI(BB, DL, TII->get(R600::FETCH_CLAUSE)).addImm(CfCount); for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { BB->splice(InsertPos, BB, Clause.second[i]); } @@ -482,7 +483,7 @@ private: Clause.first->getOperand(0).setImm(0); CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); - BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount); + BuildMI(BB, DL, TII->get(R600::ALU_CLAUSE)).addImm(CfCount); for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { BB->splice(InsertPos, BB, Clause.second[i]); } @@ -531,7 +532,7 @@ public: for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) { - DEBUG(dbgs() << CfCount << ":"; I->dump();); + LLVM_DEBUG(dbgs() << CfCount << ":"; I->dump();); FetchClauses.push_back(MakeFetchClause(MBB, I)); CfCount++; LastAlu.back() 
= nullptr; @@ -539,33 +540,34 @@ public: } MachineBasicBlock::iterator MI = I; - if (MI->getOpcode() != AMDGPU::ENDIF) + if (MI->getOpcode() != R600::ENDIF) LastAlu.back() = nullptr; - if (MI->getOpcode() == AMDGPU::CF_ALU) + if (MI->getOpcode() == R600::CF_ALU) LastAlu.back() = &*MI; I++; bool RequiresWorkAround = CFStack.requiresWorkAroundForInst(MI->getOpcode()); switch (MI->getOpcode()) { - case AMDGPU::CF_ALU_PUSH_BEFORE: + case R600::CF_ALU_PUSH_BEFORE: if (RequiresWorkAround) { - DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n"); - BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) + LLVM_DEBUG(dbgs() + << "Applying bug work-around for ALU_PUSH_BEFORE\n"); + BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(R600::CF_PUSH_EG)) .addImm(CfCount + 1) .addImm(1); - MI->setDesc(TII->get(AMDGPU::CF_ALU)); + MI->setDesc(TII->get(R600::CF_ALU)); CfCount++; - CFStack.pushBranch(AMDGPU::CF_PUSH_EG); + CFStack.pushBranch(R600::CF_PUSH_EG); } else - CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); + CFStack.pushBranch(R600::CF_ALU_PUSH_BEFORE); LLVM_FALLTHROUGH; - case AMDGPU::CF_ALU: + case R600::CF_ALU: I = MI; AluClauses.push_back(MakeALUClause(MBB, I)); - DEBUG(dbgs() << CfCount << ":"; MI->dump();); + LLVM_DEBUG(dbgs() << CfCount << ":"; MI->dump();); CfCount++; break; - case AMDGPU::WHILELOOP: { + case R600::WHILELOOP: { CFStack.pushLoop(); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_WHILE_LOOP)) @@ -578,7 +580,7 @@ public: CfCount++; break; } - case AMDGPU::ENDLOOP: { + case R600::ENDLOOP: { CFStack.popLoop(); std::pair<unsigned, std::set<MachineInstr *>> Pair = std::move(LoopStack.back()); @@ -590,19 +592,19 @@ public: CfCount++; break; } - case AMDGPU::IF_PREDICATE_SET: { + case R600::IF_PREDICATE_SET: { LastAlu.push_back(nullptr); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP)) .addImm(0) .addImm(0); IfThenElseStack.push_back(MIb); - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump();); MI->eraseFromParent(); CfCount++; break; } - case AMDGPU::ELSE: { + case R600::ELSE: { MachineInstr * JumpInst = IfThenElseStack.back(); IfThenElseStack.pop_back(); CounterPropagateAddr(*JumpInst, CfCount); @@ -610,13 +612,13 @@ public: getHWInstrDesc(CF_ELSE)) .addImm(0) .addImm(0); - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump();); IfThenElseStack.push_back(MIb); MI->eraseFromParent(); CfCount++; break; } - case AMDGPU::ENDIF: { + case R600::ENDIF: { CFStack.popBranch(); if (LastAlu.back()) { ToPopAfter.push_back(LastAlu.back()); @@ -626,7 +628,7 @@ public: .addImm(CfCount + 1) .addImm(1); (void)MIb; - DEBUG(dbgs() << CfCount << ":"; MIb->dump();); + LLVM_DEBUG(dbgs() << CfCount << ":"; MIb->dump();); CfCount++; } @@ -638,7 +640,7 @@ public: MI->eraseFromParent(); break; } - case AMDGPU::BREAK: { + case R600::BREAK: { CfCount ++; MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_LOOP_BREAK)) @@ -647,7 +649,7 @@ public: MI->eraseFromParent(); break; } - case AMDGPU::CONTINUE: { + case R600::CONTINUE: { MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_LOOP_CONTINUE)) .addImm(0); @@ -656,12 +658,12 @@ public: CfCount++; break; } - case AMDGPU::RETURN: { + case R600::RETURN: { DebugLoc DL = MBB.findDebugLoc(MI); BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END)); CfCount++; if (CfCount % 2) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD)); + BuildMI(MBB, I, DL, 
TII->get(R600::PAD)); CfCount++; } MI->eraseFromParent(); @@ -673,7 +675,7 @@ public: } default: if (TII->isExport(MI->getOpcode())) { - DEBUG(dbgs() << CfCount << ":"; MI->dump();); + LLVM_DEBUG(dbgs() << CfCount << ":"; MI->dump();); CfCount++; } break; @@ -682,7 +684,7 @@ public: for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { MachineInstr *Alu = ToPopAfter[i]; BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), - TII->get(AMDGPU::CF_ALU_POP_AFTER)) + TII->get(R600::CF_ALU_POP_AFTER)) .addImm(Alu->getOperand(0).getImm()) .addImm(Alu->getOperand(1).getImm()) .addImm(Alu->getOperand(2).getImm()) diff --git a/lib/Target/AMDGPU/R600Defines.h b/lib/Target/AMDGPU/R600Defines.h index 534461adc59f..0d33d82e8e0f 100644 --- a/lib/Target/AMDGPU/R600Defines.h +++ b/lib/Target/AMDGPU/R600Defines.h @@ -23,7 +23,7 @@ #define MO_FLAG_LAST (1 << 6) #define NUM_MO_FLAGS 7 -/// \brief Helper for getting the operand index for the instruction flags +/// Helper for getting the operand index for the instruction flags /// operand. #define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3) @@ -52,7 +52,7 @@ namespace R600_InstFlag { #define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS) -/// \brief Defines for extracting register information from register encoding +/// Defines for extracting register information from register encoding #define HW_REG_MASK 0x1ff #define HW_CHAN_SHIFT 9 diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index 0d8ccd088ec4..1683fe6c9a57 100644 --- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -19,6 +19,7 @@ #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600RegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -51,12 +52,12 @@ private: unsigned OccupiedDwords(MachineInstr &MI) const { switch (MI.getOpcode()) { - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: + case R600::INTERP_PAIR_XY: + case R600::INTERP_PAIR_ZW: + case R600::INTERP_VEC_LOAD: + case R600::DOT_4: return 4; - case AMDGPU::KILL: + case R600::KILL: return 0; default: break; @@ -76,7 +77,7 @@ private: E = MI.operands_end(); It != E; ++It) { MachineOperand &MO = *It; - if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X) ++NumLiteral; } return 1 + NumLiteral; @@ -88,12 +89,12 @@ private: if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode())) return true; switch (MI.getOpcode()) { - case AMDGPU::PRED_X: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::COPY: - case AMDGPU::DOT_4: + case R600::PRED_X: + case R600::INTERP_PAIR_XY: + case R600::INTERP_PAIR_ZW: + case R600::INTERP_VEC_LOAD: + case R600::COPY: + case R600::DOT_4: return true; default: return false; @@ -102,9 +103,9 @@ private: bool IsTrivialInst(MachineInstr &MI) const { switch (MI.getOpcode()) { - case AMDGPU::KILL: - case AMDGPU::RETURN: - case AMDGPU::IMPLICIT_DEF: + case R600::KILL: + case R600::RETURN: + case R600::IMPLICIT_DEF: return true; default: return false; @@ -131,16 +132,16 @@ private: bool UpdateInstr = true) const { std::vector<std::pair<unsigned, unsigned>> UsedKCache; - if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4) + if 
(!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != R600::DOT_4) return true; const SmallVectorImpl<std::pair<MachineOperand *, int64_t>> &Consts = TII->getSrcs(MI); assert( - (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == AMDGPU::DOT_4) && + (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == R600::DOT_4) && "Can't assign Const"); for (unsigned i = 0, n = Consts.size(); i < n; ++i) { - if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) + if (Consts[i].first->getReg() != R600::ALU_CONST) continue; unsigned Sel = Consts[i].second; unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; @@ -171,16 +172,16 @@ private: return true; for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { - if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) + if (Consts[i].first->getReg() != R600::ALU_CONST) continue; switch(UsedKCache[j].first) { case 0: Consts[i].first->setReg( - AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second)); + R600::R600_KC0RegClass.getRegister(UsedKCache[j].second)); break; case 1: Consts[i].first->setReg( - AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second)); + R600::R600_KC1RegClass.getRegister(UsedKCache[j].second)); break; default: llvm_unreachable("Wrong Cache Line"); @@ -252,7 +253,7 @@ private: break; if (AluInstCount > TII->getMaxAlusPerClause()) break; - if (I->getOpcode() == AMDGPU::PRED_X) { + if (I->getOpcode() == R600::PRED_X) { // We put PRED_X in its own clause to ensure that ifcvt won't create // clauses with more than 128 insts. // IfCvt is indeed checking that "then" and "else" branches of an if @@ -288,7 +289,7 @@ private: AluInstCount += OccupiedDwords(*I); } unsigned Opcode = PushBeforeModifier ? - AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; + R600::CF_ALU_PUSH_BEFORE : R600::CF_ALU; BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) // We don't use the ADDR field until R600ControlFlowFinalizer pass, where // it is safe to assume it is 0. 
However if we always put 0 here, the ifcvt @@ -321,7 +322,7 @@ public: BB != BB_E; ++BB) { MachineBasicBlock &MBB = *BB; MachineBasicBlock::iterator I = MBB.begin(); - if (I != MBB.end() && I->getOpcode() == AMDGPU::CF_ALU) + if (I != MBB.end() && I->getOpcode() == R600::CF_ALU) continue; // BB was already parsed for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { if (isALU(*I)) { diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index ffea231ee4d0..b924ff019dd1 100644 --- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -21,6 +21,7 @@ #include "R600RegisterInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -95,16 +96,16 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { // Expand LDS_*_RET instructions if (TII->isLDSRetInstr(MI.getOpcode())) { - int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst); assert(DstIdx != -1); MachineOperand &DstOp = MI.getOperand(DstIdx); MachineInstr *Mov = TII->buildMovInstr(&MBB, I, - DstOp.getReg(), AMDGPU::OQAP); - DstOp.setReg(AMDGPU::OQAP); + DstOp.getReg(), R600::OQAP); + DstOp.setReg(R600::OQAP); int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(), - AMDGPU::OpName::pred_sel); + R600::OpName::pred_sel); int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(), - AMDGPU::OpName::pred_sel); + R600::OpName::pred_sel); // Copy the pred_sel bit Mov->getOperand(MovPredSelIdx).setReg( MI.getOperand(LDSPredSelIdx).getReg()); @@ -113,7 +114,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { switch (MI.getOpcode()) { default: break; // Expand PRED_X to one of the PRED_SET instructions. - case AMDGPU::PRED_X: { + case R600::PRED_X: { uint64_t Flags = MI.getOperand(3).getImm(); // The native opcode used by PRED_X is stored as an immediate in the // third operand. 
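
The KCache handling in R600EmitClauseMarkers above decodes each ALU_CONST selector into a channel and a line index (Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31) before rewriting the operand to an R600_KC0 or R600_KC1 register. A stand-alone restatement of that decoding, with invented names and comments derived only from the constants visible in the code above:

#include <cstdint>

struct KCacheSlotSketch {
  unsigned Chan;    // 0..3, the X/Y/Z/W component of the constant
  unsigned Index;   // line index within the cached constant window
};

KCacheSlotSketch decodeConstSel(unsigned Sel) {
  KCacheSlotSketch S;
  S.Chan = Sel & 3;                    // low two bits select the channel
  S.Index = ((Sel >> 2) - 512) & 31;   // constants start at 512, 32-entry window,
                                       //   as implied by the -512 and &31 above
  return S;
}
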
@@ -121,17 +122,18 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { MI.getOperand(2).getImm(), // opcode MI.getOperand(0).getReg(), // dst MI.getOperand(1).getReg(), // src0 - AMDGPU::ZERO); // src1 + R600::ZERO); // src1 TII->addFlag(*PredSet, 0, MO_FLAG_MASK); if (Flags & MO_FLAG_PUSH) { - TII->setImmOperand(*PredSet, AMDGPU::OpName::update_exec_mask, 1); + TII->setImmOperand(*PredSet, R600::OpName::update_exec_mask, 1); } else { - TII->setImmOperand(*PredSet, AMDGPU::OpName::update_pred, 1); + TII->setImmOperand(*PredSet, R600::OpName::update_pred, 1); } MI.eraseFromParent(); continue; } - case AMDGPU::DOT_4: { + case R600::DOT_4: { + const R600RegisterInfo &TRI = TII->getRegisterInfo(); unsigned DstReg = MI.getOperand(0).getReg(); @@ -140,7 +142,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { for (unsigned Chan = 0; Chan < 4; ++Chan) { bool Mask = (Chan != TRI.getHWRegChan(DstReg)); unsigned SubDstReg = - AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + R600::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); MachineInstr *BMI = TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg); if (Chan > 0) { @@ -155,10 +157,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { // While not strictly necessary from hw point of view, we force // all src operands of a dot4 inst to belong to the same slot. unsigned Src0 = BMI->getOperand( - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0)) + TII->getOperandIdx(Opcode, R600::OpName::src0)) .getReg(); unsigned Src1 = BMI->getOperand( - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1)) + TII->getOperandIdx(Opcode, R600::OpName::src1)) .getReg(); (void) Src0; (void) Src1; @@ -205,26 +207,26 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { // T0_W = CUBE T1_Y, T1_Z for (unsigned Chan = 0; Chan < 4; Chan++) { unsigned DstReg = MI.getOperand( - TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg(); + TII->getOperandIdx(MI, R600::OpName::dst)).getReg(); unsigned Src0 = MI.getOperand( - TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg(); + TII->getOperandIdx(MI, R600::OpName::src0)).getReg(); unsigned Src1 = 0; // Determine the correct source registers if (!IsCube) { - int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1); + int Src1Idx = TII->getOperandIdx(MI, R600::OpName::src1); if (Src1Idx != -1) { Src1 = MI.getOperand(Src1Idx).getReg(); } } if (IsReduction) { - unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan); Src0 = TRI.getSubReg(Src0, SubRegIndex); Src1 = TRI.getSubReg(Src1, SubRegIndex); } else if (IsCube) { static const int CubeSrcSwz[] = {2, 2, 0, 1}; - unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]); - unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]); + unsigned SubRegIndex0 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[Chan]); + unsigned SubRegIndex1 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[3 - Chan]); Src1 = TRI.getSubReg(Src0, SubRegIndex1); Src0 = TRI.getSubReg(Src0, SubRegIndex0); } @@ -233,14 +235,14 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { bool Mask = false; bool NotLast = true; if (IsCube) { - unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan); DstReg = TRI.getSubReg(DstReg, SubRegIndex); } else { // Mask the write if the 
original instruction does not write to // the current Channel. Mask = (Chan != TRI.getHWRegChan(DstReg)); unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; - DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + DstReg = R600::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); } // Set the IsLast bit @@ -249,11 +251,11 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { // Add the new instruction unsigned Opcode = MI.getOpcode(); switch (Opcode) { - case AMDGPU::CUBE_r600_pseudo: - Opcode = AMDGPU::CUBE_r600_real; + case R600::CUBE_r600_pseudo: + Opcode = R600::CUBE_r600_real; break; - case AMDGPU::CUBE_eg_pseudo: - Opcode = AMDGPU::CUBE_eg_real; + case R600::CUBE_eg_pseudo: + Opcode = R600::CUBE_eg_real; break; default: break; @@ -270,12 +272,12 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { if (NotLast) { TII->addFlag(*NewMI, 0, MO_FLAG_NOT_LAST); } - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg); + SetFlagInNewMI(NewMI, &MI, R600::OpName::clamp); + SetFlagInNewMI(NewMI, &MI, R600::OpName::literal); + SetFlagInNewMI(NewMI, &MI, R600::OpName::src0_abs); + SetFlagInNewMI(NewMI, &MI, R600::OpName::src1_abs); + SetFlagInNewMI(NewMI, &MI, R600::OpName::src0_neg); + SetFlagInNewMI(NewMI, &MI, R600::OpName::src1_neg); } MI.eraseFromParent(); } diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 66291d0be4e6..113d6249fa60 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -8,18 +8,18 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Custom DAG lowering for R600 +/// Custom DAG lowering for R600 // //===----------------------------------------------------------------------===// #include "R600ISelLowering.h" #include "AMDGPUFrameLowering.h" -#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600FrameLowering.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" @@ -35,13 +35,13 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachineValueType.h" #include <cassert> #include <cstdint> #include <iterator> @@ -50,17 +50,19 @@ using namespace llvm; +#include "R600GenCallingConv.inc" + R600TargetLowering::R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI) - : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { - addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); - addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); - addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); - addRegisterClass(MVT::v4f32, 
&AMDGPU::R600_Reg128RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); + : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) { + addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass); + addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass); + addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass); + addRegisterClass(MVT::v2i32, &R600::R600_Reg64RegClass); + addRegisterClass(MVT::v4f32, &R600::R600_Reg128RegClass); + addRegisterClass(MVT::v4i32, &R600::R600_Reg128RegClass); - computeRegisterProperties(STI.getRegisterInfo()); + computeRegisterProperties(Subtarget->getRegisterInfo()); // Legalize loads and stores to the private address space. setOperationAction(ISD::LOAD, MVT::i32, Custom); @@ -147,6 +149,11 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSUB, MVT::f32, Expand); + setOperationAction(ISD::FCEIL, MVT::f64, Custom); + setOperationAction(ISD::FTRUNC, MVT::f64, Custom); + setOperationAction(ISD::FRINT, MVT::f64, Custom); + setOperationAction(ISD::FFLOOR, MVT::f64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); @@ -216,6 +223,34 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f64, Expand); } + // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we + // need it for R600. + if (!Subtarget->hasFP32Denormals()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); + + if (!Subtarget->hasBFI()) { + // fcopysign can be done in a single instruction with BFI. + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + } + + if (!Subtarget->hasBCNT(32)) + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + + if (!Subtarget->hasBCNT(64)) + setOperationAction(ISD::CTPOP, MVT::i64, Expand); + + if (Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); + + if (Subtarget->hasFFBL()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); + + // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we + // need it for R600. + if (Subtarget->hasBFE()) + setHasExtractBitsInsn(true); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; @@ -245,14 +280,10 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::LOAD); } -const R600Subtarget *R600TargetLowering::getSubtarget() const { - return static_cast<const R600Subtarget *>(Subtarget); -} - static inline bool isEOP(MachineBasicBlock::iterator I) { if (std::next(I) == I->getParent()->end()) return false; - return std::next(I)->getOpcode() == AMDGPU::RETURN; + return std::next(I)->getOpcode() == R600::RETURN; } MachineBasicBlock * @@ -261,24 +292,24 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock::iterator I = MI; - const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); + const R600InstrInfo *TII = Subtarget->getInstrInfo(); switch (MI.getOpcode()) { default: // Replace LDS_*_RET instruction that don't have any uses with the // equivalent LDS_*_NORET instruction. 
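The custom inserter below keeps the LDS _RET form only when its destination is actually read; LDS_CMPST_RET is left untouched because getLDSNoRetOp only covers the LDS_1A1D opcodes so far. A tiny standalone sketch of that decision, with illustrative names rather than the real opcodes:

  // Standalone sketch of the LDS _RET -> _NORET decision (illustrative names).
  #include <cassert>

  enum class LdsForm { Ret, NoRet };

  LdsForm chooseLdsForm(bool DstHasUses, bool IsCmpStRet) {
    // Keep the returning form if something reads the value, or if the opcode
    // is one the NORET mapping does not cover yet.
    return (DstHasUses || IsCmpStRet) ? LdsForm::Ret : LdsForm::NoRet;
  }

  int main() {
    assert(chooseLdsForm(false, false) == LdsForm::NoRet);
    assert(chooseLdsForm(true, false) == LdsForm::Ret);
    return 0;
  }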
if (TII->isLDSRetInstr(MI.getOpcode())) { - int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst); assert(DstIdx != -1); MachineInstrBuilder NewMI; // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add // LDS_1A2D support and remove this special case. if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) || - MI.getOpcode() == AMDGPU::LDS_CMPST_RET) + MI.getOpcode() == R600::LDS_CMPST_RET) return BB; NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), - TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode()))); + TII->get(R600::getLDSNoRetOp(MI.getOpcode()))); for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) { NewMI.add(MI.getOperand(i)); } @@ -286,31 +317,24 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } break; - case AMDGPU::CLAMP_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction( - *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), - MI.getOperand(1).getReg()); - TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP); - break; - } - case AMDGPU::FABS_R600: { + case R600::FABS_R600: { MachineInstr *NewMI = TII->buildDefaultInstruction( - *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + *BB, I, R600::MOV, MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); TII->addFlag(*NewMI, 0, MO_FLAG_ABS); break; } - case AMDGPU::FNEG_R600: { + case R600::FNEG_R600: { MachineInstr *NewMI = TII->buildDefaultInstruction( - *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + *BB, I, R600::MOV, MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); TII->addFlag(*NewMI, 0, MO_FLAG_NEG); break; } - case AMDGPU::MASK_WRITE: { + case R600::MASK_WRITE: { unsigned maskedRegister = MI.getOperand(0).getReg(); assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); @@ -318,7 +342,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, break; } - case AMDGPU::MOV_IMM_F32: + case R600::MOV_IMM_F32: TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1) .getFPImm() ->getValueAPF() @@ -326,39 +350,39 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .getZExtValue()); break; - case AMDGPU::MOV_IMM_I32: + case R600::MOV_IMM_I32: TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1).getImm()); break; - case AMDGPU::MOV_IMM_GLOBAL_ADDR: { + case R600::MOV_IMM_GLOBAL_ADDR: { //TODO: Perhaps combine this instruction with the next if possible auto MIB = TII->buildDefaultInstruction( - *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X); - int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal); + *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_LITERAL_X); + int Idx = TII->getOperandIdx(*MIB, R600::OpName::literal); //TODO: Ugh this is rather ugly MIB->getOperand(Idx) = MI.getOperand(1); break; } - case AMDGPU::CONST_COPY: { + case R600::CONST_COPY: { MachineInstr *NewMI = TII->buildDefaultInstruction( - *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); - TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel, + *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_CONST); + TII->setImmOperand(*NewMI, R600::OpName::src0_sel, MI.getOperand(1).getImm()); break; } - case AMDGPU::RAT_WRITE_CACHELESS_32_eg: - case AMDGPU::RAT_WRITE_CACHELESS_64_eg: - case AMDGPU::RAT_WRITE_CACHELESS_128_eg: + case R600::RAT_WRITE_CACHELESS_32_eg: + case R600::RAT_WRITE_CACHELESS_64_eg: + case 
R600::RAT_WRITE_CACHELESS_128_eg: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) .add(MI.getOperand(0)) .add(MI.getOperand(1)) .addImm(isEOP(I)); // Set End of program bit break; - case AMDGPU::RAT_STORE_TYPED_eg: + case R600::RAT_STORE_TYPED_eg: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) .add(MI.getOperand(0)) .add(MI.getOperand(1)) @@ -366,49 +390,49 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addImm(isEOP(I)); // Set End of program bit break; - case AMDGPU::BRANCH: - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + case R600::BRANCH: + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP)) .add(MI.getOperand(0)); break; - case AMDGPU::BRANCH_COND_f32: { + case R600::BRANCH_COND_f32: { MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X), + R600::PREDICATE_BIT) .add(MI.getOperand(1)) - .addImm(AMDGPU::PRED_SETNE) + .addImm(R600::PRED_SETNE) .addImm(0); // Flags TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND)) .add(MI.getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + .addReg(R600::PREDICATE_BIT, RegState::Kill); break; } - case AMDGPU::BRANCH_COND_i32: { + case R600::BRANCH_COND_i32: { MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X), + R600::PREDICATE_BIT) .add(MI.getOperand(1)) - .addImm(AMDGPU::PRED_SETNE_INT) + .addImm(R600::PRED_SETNE_INT) .addImm(0); // Flags TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND)) .add(MI.getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + .addReg(R600::PREDICATE_BIT, RegState::Kill); break; } - case AMDGPU::EG_ExportSwz: - case AMDGPU::R600_ExportSwz: { + case R600::EG_ExportSwz: + case R600::R600_ExportSwz: { // Instruction is left unmodified if its not the last one of its type bool isLastInstructionOfItsType = true; unsigned InstExportType = MI.getOperand(1).getImm(); for (MachineBasicBlock::iterator NextExportInst = std::next(I), EndBlock = BB->end(); NextExportInst != EndBlock; NextExportInst = std::next(NextExportInst)) { - if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || - NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { + if (NextExportInst->getOpcode() == R600::EG_ExportSwz || + NextExportInst->getOpcode() == R600::R600_ExportSwz) { unsigned CurrentInstExportType = NextExportInst->getOperand(1) .getImm(); if (CurrentInstExportType == InstExportType) { @@ -420,7 +444,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, bool EOP = isEOP(I); if (!EOP && !isLastInstructionOfItsType) return BB; - unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40; + unsigned CfInst = (MI.getOpcode() == R600::EG_ExportSwz) ? 
84 : 40; BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) .add(MI.getOperand(0)) .add(MI.getOperand(1)) @@ -433,7 +457,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addImm(EOP); break; } - case AMDGPU::RETURN: { + case R600::RETURN: { return BB; } } @@ -478,7 +502,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); switch (IntrinsicID) { - case AMDGPUIntrinsic::r600_store_swizzle: { + case Intrinsic::r600_store_swizzle: { SDLoc DL(Op); const SDValue Args[8] = { Chain, @@ -505,14 +529,14 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const EVT VT = Op.getValueType(); SDLoc DL(Op); switch (IntrinsicID) { - case AMDGPUIntrinsic::r600_tex: - case AMDGPUIntrinsic::r600_texc: { + case Intrinsic::r600_tex: + case Intrinsic::r600_texc: { unsigned TextureOp; switch (IntrinsicID) { - case AMDGPUIntrinsic::r600_tex: + case Intrinsic::r600_tex: TextureOp = 0; break; - case AMDGPUIntrinsic::r600_texc: + case Intrinsic::r600_texc: TextureOp = 1; break; default: @@ -542,7 +566,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const }; return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); } - case AMDGPUIntrinsic::r600_dot4: { + case Intrinsic::r600_dot4: { SDValue Args[8] = { DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), DAG.getConstant(0, DL, MVT::i32)), @@ -566,7 +590,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_implicitarg_ptr: { MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS); - uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); + uint32_t ByteOffset = getImplicitParameterOffset(MF, FIRST_IMPLICIT); return DAG.getConstant(ByteOffset, DL, PtrVT); } case Intrinsic::r600_read_ngroups_x: @@ -589,23 +613,23 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const return LowerImplicitParameter(DAG, VT, DL, 8); case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_X, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T1_X, VT); case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Y, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T1_Y, VT); case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Z, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T1_Z, VT); case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_X, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T0_X, VT); case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Y, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T0_Y, VT); case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Z, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T0_Z, VT); case Intrinsic::r600_recipsqrt_ieee: return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); @@ -755,7 +779,7 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 
SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, DAG.getNode(ISD::FADD, DL, VT, FractPart, DAG.getConstantFP(-0.5, DL, MVT::f32))); - if (Gen >= R600Subtarget::R700) + if (Gen >= AMDGPUSubtarget::R700) return TrigVal; // On R600 hw, COS/SIN input must be between -Pi and Pi. return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, @@ -1527,7 +1551,7 @@ SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue R600TargetLowering::lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); + const R600FrameLowering *TFL = Subtarget->getFrameLowering(); FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); @@ -1539,6 +1563,28 @@ SDValue R600TargetLowering::lowerFrameIndex(SDValue Op, Op.getValueType()); } +CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC, + bool IsVarArg) const { + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + case CallingConv::C: + case CallingConv::Fast: + case CallingConv::Cold: + llvm_unreachable("kernels should not be handled here"); + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: + case CallingConv::AMDGPU_LS: + return CC_R600; + default: + report_fatal_error("Unsupported calling convention."); + } +} + /// XXX Only kernel functions are supported, so we can assume for now that /// every function is a kernel function, but in the future we should use /// separate calling conventions for kernel and non-kernel functions. @@ -1550,8 +1596,6 @@ SDValue R600TargetLowering::LowerFormalArguments( CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); MachineFunction &MF = DAG.getMachineFunction(); - R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); - SmallVector<ISD::InputArg, 8> LocalIns; if (AMDGPU::isShader(CallConv)) { @@ -1571,7 +1615,7 @@ SDValue R600TargetLowering::LowerFormalArguments( } if (AMDGPU::isShader(CallConv)) { - unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass); SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); InVals.push_back(Register); continue; @@ -1602,19 +1646,18 @@ SDValue R600TargetLowering::LowerFormalArguments( unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); unsigned PartOffset = VA.getLocMemOffset(); - unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset(); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); SDValue Arg = DAG.getLoad( ISD::UNINDEXED, Ext, VT, DL, Chain, - DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo, + DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), + PtrInfo, MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); // 4 is the preferred alignment for the CONSTANT memory space. 
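The new R600TargetLowering::CCAssignFnForCall hook above routes every shader calling convention to the generated CC_R600 table (pulled in via R600GenCallingConv.inc), while kernel conventions never reach it because their arguments are loaded from constant memory in LowerFormalArguments instead. A minimal standalone sketch of that dispatch shape; the enum and the CC_R600_Table placeholder below are simplified stand-ins, not LLVM's types:

  // Simplified model of the calling-convention dispatch (placeholders only).
  #include <stdexcept>

  enum class Conv { Kernel, ShaderVS, ShaderPS, ShaderCS, Unknown };
  using AssignFn = const char *(*)();
  const char *CC_R600_Table() { return "CC_R600"; } // stands in for the generated table

  AssignFn ccAssignFnForCall(Conv C) {
    switch (C) {
    case Conv::Kernel:
      throw std::logic_error("kernels are lowered via explicit kernarg loads");
    case Conv::ShaderVS:
    case Conv::ShaderPS:
    case Conv::ShaderCS:
      return CC_R600_Table; // every shader convention shares one generated table
    default:
      throw std::runtime_error("unsupported calling convention");
    }
  }

  int main() { return ccAssignFnForCall(Conv::ShaderPS)() != nullptr ? 0 : 1; }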
InVals.push_back(Arg); - MFI->setABIArgOffset(Offset + MemVT.getStoreSize()); } return Chain; } @@ -1989,26 +2032,26 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) const { - const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); + const R600InstrInfo *TII = Subtarget->getInstrInfo(); if (!Src.isMachineOpcode()) return false; switch (Src.getMachineOpcode()) { - case AMDGPU::FNEG_R600: + case R600::FNEG_R600: if (!Neg.getNode()) return false; Src = Src.getOperand(0); Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); return true; - case AMDGPU::FABS_R600: + case R600::FABS_R600: if (!Abs.getNode()) return false; Src = Src.getOperand(0); Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); return true; - case AMDGPU::CONST_COPY: { + case R600::CONST_COPY: { unsigned Opcode = ParentNode->getMachineOpcode(); - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1; if (!Sel.getNode()) return false; @@ -2019,17 +2062,17 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, // Gather constants values int SrcIndices[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) + TII->getOperandIdx(Opcode, R600::OpName::src0), + TII->getOperandIdx(Opcode, R600::OpName::src1), + TII->getOperandIdx(Opcode, R600::OpName::src2), + TII->getOperandIdx(Opcode, R600::OpName::src0_X), + TII->getOperandIdx(Opcode, R600::OpName::src0_Y), + TII->getOperandIdx(Opcode, R600::OpName::src0_Z), + TII->getOperandIdx(Opcode, R600::OpName::src0_W), + TII->getOperandIdx(Opcode, R600::OpName::src1_X), + TII->getOperandIdx(Opcode, R600::OpName::src1_Y), + TII->getOperandIdx(Opcode, R600::OpName::src1_Z), + TII->getOperandIdx(Opcode, R600::OpName::src1_W) }; std::vector<unsigned> Consts; for (int OtherSrcIdx : SrcIndices) { @@ -2042,7 +2085,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, } if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) { - if (Reg->getReg() == AMDGPU::ALU_CONST) { + if (Reg->getReg() == R600::ALU_CONST) { ConstantSDNode *Cst = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx)); Consts.push_back(Cst->getZExtValue()); @@ -2057,30 +2100,30 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, } Sel = CstOffset; - Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32); + Src = DAG.getRegister(R600::ALU_CONST, MVT::f32); return true; } - case AMDGPU::MOV_IMM_GLOBAL_ADDR: + case R600::MOV_IMM_GLOBAL_ADDR: // Check if the Imm slot is used. Taken from below. 
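The MOV_IMM_I32 / MOV_IMM_F32 cases that follow fold a handful of special immediates onto R600's inline-constant registers and fall back to the single ALU_LITERAL_X slot for everything else. A standalone sketch of the float side of that mapping (illustrative, not the LLVM API):

  // Standalone sketch of the MOV_IMM_F32 folding decision (illustrative).
  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  const char *foldFloatImm(float V, uint32_t &LiteralBits) {
    if (V == 0.0f) return "ZERO";
    if (V == 0.5f) return "HALF";
    if (V == 1.0f) return "ONE";
    std::memcpy(&LiteralBits, &V, sizeof V); // raw bits go into the literal slot
    return "ALU_LITERAL_X";
  }

  int main() {
    uint32_t Bits = 0;
    std::printf("%s\n", foldFloatImm(0.5f, Bits));               // HALF, no literal used
    std::printf("%s 0x%08X\n", foldFloatImm(1.5f, Bits), Bits);  // ALU_LITERAL_X 0x3FC00000
    return 0;
  }

The integer path is analogous: 0 folds to ZERO and 1 to ONE_INT, anything else occupies the literal slot, which is why FoldOperand then refuses to fold a second distinct immediate into the same instruction.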
if (cast<ConstantSDNode>(Imm)->getZExtValue()) return false; Imm = Src.getOperand(0); - Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32); + Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32); return true; - case AMDGPU::MOV_IMM_I32: - case AMDGPU::MOV_IMM_F32: { - unsigned ImmReg = AMDGPU::ALU_LITERAL_X; + case R600::MOV_IMM_I32: + case R600::MOV_IMM_F32: { + unsigned ImmReg = R600::ALU_LITERAL_X; uint64_t ImmValue = 0; - if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) { + if (Src.getMachineOpcode() == R600::MOV_IMM_F32) { ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0)); float FloatValue = FPC->getValueAPF().convertToFloat(); if (FloatValue == 0.0) { - ImmReg = AMDGPU::ZERO; + ImmReg = R600::ZERO; } else if (FloatValue == 0.5) { - ImmReg = AMDGPU::HALF; + ImmReg = R600::HALF; } else if (FloatValue == 1.0) { - ImmReg = AMDGPU::ONE; + ImmReg = R600::ONE; } else { ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); } @@ -2088,9 +2131,9 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0)); uint64_t Value = C->getZExtValue(); if (Value == 0) { - ImmReg = AMDGPU::ZERO; + ImmReg = R600::ZERO; } else if (Value == 1) { - ImmReg = AMDGPU::ONE_INT; + ImmReg = R600::ONE_INT; } else { ImmValue = Value; } @@ -2099,7 +2142,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, // Check that we aren't already using an immediate. // XXX: It's possible for an instruction to have more than one // immediate operand, but this is not supported yet. - if (ImmReg == AMDGPU::ALU_LITERAL_X) { + if (ImmReg == R600::ALU_LITERAL_X) { if (!Imm.getNode()) return false; ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm); @@ -2116,10 +2159,10 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, } } -/// \brief Fold the instructions after selecting them +/// Fold the instructions after selecting them SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { - const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); + const R600InstrInfo *TII = Subtarget->getInstrInfo(); if (!Node->isMachineOpcode()) return Node; @@ -2128,36 +2171,36 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, std::vector<SDValue> Ops(Node->op_begin(), Node->op_end()); - if (Opcode == AMDGPU::DOT_4) { + if (Opcode == R600::DOT_4) { int OperandIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) + TII->getOperandIdx(Opcode, R600::OpName::src0_X), + TII->getOperandIdx(Opcode, R600::OpName::src0_Y), + TII->getOperandIdx(Opcode, R600::OpName::src0_Z), + TII->getOperandIdx(Opcode, R600::OpName::src0_W), + TII->getOperandIdx(Opcode, R600::OpName::src1_X), + TII->getOperandIdx(Opcode, R600::OpName::src1_Y), + TII->getOperandIdx(Opcode, R600::OpName::src1_Z), + TII->getOperandIdx(Opcode, R600::OpName::src1_W) }; int NegIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z), - TII->getOperandIdx(Opcode, 
AMDGPU::OpName::src0_neg_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W) + TII->getOperandIdx(Opcode, R600::OpName::src0_neg_X), + TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Y), + TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Z), + TII->getOperandIdx(Opcode, R600::OpName::src0_neg_W), + TII->getOperandIdx(Opcode, R600::OpName::src1_neg_X), + TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Y), + TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Z), + TII->getOperandIdx(Opcode, R600::OpName::src1_neg_W) }; int AbsIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W) + TII->getOperandIdx(Opcode, R600::OpName::src0_abs_X), + TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Y), + TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Z), + TII->getOperandIdx(Opcode, R600::OpName::src0_abs_W), + TII->getOperandIdx(Opcode, R600::OpName::src1_abs_X), + TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Y), + TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Z), + TII->getOperandIdx(Opcode, R600::OpName::src1_abs_W) }; for (unsigned i = 0; i < 8; i++) { if (OperandIdx[i] < 0) @@ -2165,7 +2208,7 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, SDValue &Src = Ops[OperandIdx[i] - 1]; SDValue &Neg = Ops[NegIdx[i] - 1]; SDValue &Abs = Ops[AbsIdx[i] - 1]; - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1; int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); if (HasDst) SelIdx--; @@ -2173,42 +2216,28 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG)) return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); } - } else if (Opcode == AMDGPU::REG_SEQUENCE) { + } else if (Opcode == R600::REG_SEQUENCE) { for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) { SDValue &Src = Ops[i]; if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG)) return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); } - } else if (Opcode == AMDGPU::CLAMP_R600) { - SDValue Src = Node->getOperand(0); - if (!Src.isMachineOpcode() || - !TII->hasInstrModifiers(Src.getMachineOpcode())) - return Node; - int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(), - AMDGPU::OpName::clamp); - if (ClampIdx < 0) - return Node; - SDLoc DL(Node); - std::vector<SDValue> Ops(Src->op_begin(), Src->op_end()); - Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32); - return DAG.getMachineNode(Src.getMachineOpcode(), DL, - Node->getVTList(), Ops); } else { if (!TII->hasInstrModifiers(Opcode)) return Node; int OperandIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2) + TII->getOperandIdx(Opcode, R600::OpName::src0), + TII->getOperandIdx(Opcode, R600::OpName::src1), + TII->getOperandIdx(Opcode, 
R600::OpName::src2) }; int NegIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg) + TII->getOperandIdx(Opcode, R600::OpName::src0_neg), + TII->getOperandIdx(Opcode, R600::OpName::src1_neg), + TII->getOperandIdx(Opcode, R600::OpName::src2_neg) }; int AbsIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs), + TII->getOperandIdx(Opcode, R600::OpName::src0_abs), + TII->getOperandIdx(Opcode, R600::OpName::src1_abs), -1 }; for (unsigned i = 0; i < 3; i++) { @@ -2218,9 +2247,9 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, SDValue &Neg = Ops[NegIdx[i] - 1]; SDValue FakeAbs; SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs; - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1; int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); - int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal); + int ImmIdx = TII->getOperandIdx(Opcode, R600::OpName::literal); if (HasDst) { SelIdx--; ImmIdx--; diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index 2a774693f02b..907d1f10e151 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief R600 DAG Lowering interface definition +/// R600 DAG Lowering interface definition // //===----------------------------------------------------------------------===// @@ -23,6 +23,8 @@ class R600InstrInfo; class R600Subtarget; class R600TargetLowering final : public AMDGPUTargetLowering { + + const R600Subtarget *Subtarget; public: R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI); @@ -36,6 +38,7 @@ public: void ReplaceNodeResults(SDNode * N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, diff --git a/lib/Target/AMDGPU/R600InstrFormats.td b/lib/Target/AMDGPU/R600InstrFormats.td index 61106ed42e64..687a9affa138 100644 --- a/lib/Target/AMDGPU/R600InstrFormats.td +++ b/lib/Target/AMDGPU/R600InstrFormats.td @@ -11,10 +11,10 @@ // //===----------------------------------------------------------------------===// -def isR600 : Predicate<"Subtarget->getGeneration() <= R600Subtarget::R700">; +def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">; def isR600toCayman : Predicate< - "Subtarget->getGeneration() <= R600Subtarget::NORTHERN_ISLANDS">; + "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">; class R600Pat<dag pattern, dag result> : AMDGPUPat<pattern, result> { let SubtargetPredicate = isR600toCayman; @@ -41,7 +41,7 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern, bit LDS_1A2D = 0; let SubtargetPredicate = isR600toCayman; - let Namespace = "AMDGPU"; + let Namespace = "R600"; let OutOperandList = outs; let InOperandList = ins; let AsmString = asm; diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp index 23e646c8147c..5397e779474c 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -8,7 +8,7 @@ 
//===----------------------------------------------------------------------===// // /// \file -/// \brief R600 Implementation of TargetInstrInfo. +/// R600 Implementation of TargetInstrInfo. // //===----------------------------------------------------------------------===// @@ -19,6 +19,7 @@ #include "R600Defines.h" #include "R600FrameLowering.h" #include "R600RegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallSet.h" @@ -44,10 +45,15 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR -#include "AMDGPUGenDFAPacketizer.inc" +#include "R600GenDFAPacketizer.inc" + +#define GET_INSTRINFO_CTOR_DTOR +#define GET_INSTRMAP_INFO +#define GET_INSTRINFO_NAMED_OPS +#include "R600GenInstrInfo.inc" R600InstrInfo::R600InstrInfo(const R600Subtarget &ST) - : AMDGPUInstrInfo(ST), RI(), ST(ST) {} + : R600GenInstrInfo(-1, -1), RI(), ST(ST) {} bool R600InstrInfo::isVector(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; @@ -58,31 +64,31 @@ void R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { unsigned VectorComponents = 0; - if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || - AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && - (AMDGPU::R600_Reg128RegClass.contains(SrcReg) || - AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) { + if ((R600::R600_Reg128RegClass.contains(DestReg) || + R600::R600_Reg128VerticalRegClass.contains(DestReg)) && + (R600::R600_Reg128RegClass.contains(SrcReg) || + R600::R600_Reg128VerticalRegClass.contains(SrcReg))) { VectorComponents = 4; - } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) || - AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) && - (AMDGPU::R600_Reg64RegClass.contains(SrcReg) || - AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) { + } else if((R600::R600_Reg64RegClass.contains(DestReg) || + R600::R600_Reg64VerticalRegClass.contains(DestReg)) && + (R600::R600_Reg64RegClass.contains(SrcReg) || + R600::R600_Reg64VerticalRegClass.contains(SrcReg))) { VectorComponents = 2; } if (VectorComponents > 0) { for (unsigned I = 0; I < VectorComponents; I++) { - unsigned SubRegIndex = RI.getSubRegFromChannel(I); - buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(I); + buildDefaultInstruction(MBB, MI, R600::MOV, RI.getSubReg(DestReg, SubRegIndex), RI.getSubReg(SrcReg, SubRegIndex)) .addReg(DestReg, RegState::Define | RegState::Implicit); } } else { - MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, R600::MOV, DestReg, SrcReg); - NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0)) + NewMI->getOperand(getOperandIdx(*NewMI, R600::OpName::src0)) .setIsKill(KillSrc); } } @@ -103,9 +109,9 @@ bool R600InstrInfo::isMov(unsigned Opcode) const { switch(Opcode) { default: return false; - case AMDGPU::MOV: - case AMDGPU::MOV_IMM_F32: - case AMDGPU::MOV_IMM_I32: + case R600::MOV: + case R600::MOV_IMM_F32: + case R600::MOV_IMM_I32: return true; } } @@ -117,10 +123,10 @@ bool R600InstrInfo::isReductionOp(unsigned Opcode) const { bool R600InstrInfo::isCubeOp(unsigned Opcode) const { switch(Opcode) { default: return false; - case AMDGPU::CUBE_r600_pseudo: - case AMDGPU::CUBE_r600_real: - case AMDGPU::CUBE_eg_pseudo: - case AMDGPU::CUBE_eg_real: + case R600::CUBE_r600_pseudo: + case 
R600::CUBE_r600_real: + case R600::CUBE_eg_pseudo: + case R600::CUBE_eg_real: return true; } } @@ -148,7 +154,7 @@ bool R600InstrInfo::isLDSInstr(unsigned Opcode) const { } bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const { - return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1; + return isLDSInstr(Opcode) && getOperandIdx(Opcode, R600::OpName::dst) != -1; } bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const { @@ -157,12 +163,12 @@ bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const { if (isVector(MI) || isCubeOp(MI.getOpcode())) return true; switch (MI.getOpcode()) { - case AMDGPU::PRED_X: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::COPY: - case AMDGPU::DOT_4: + case R600::PRED_X: + case R600::INTERP_PAIR_XY: + case R600::INTERP_PAIR_ZW: + case R600::INTERP_VEC_LOAD: + case R600::COPY: + case R600::DOT_4: return true; default: return false; @@ -172,7 +178,7 @@ bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const { bool R600InstrInfo::isTransOnly(unsigned Opcode) const { if (ST.hasCaymanISA()) return false; - return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU); + return (get(Opcode).getSchedClass() == R600::Sched::TransALU); } bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const { @@ -180,7 +186,7 @@ bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const { } bool R600InstrInfo::isVectorOnly(unsigned Opcode) const { - return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU); + return (get(Opcode).getSchedClass() == R600::Sched::VecALU); } bool R600InstrInfo::isVectorOnly(const MachineInstr &MI) const { @@ -214,8 +220,8 @@ bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const { bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { switch (Opcode) { - case AMDGPU::KILLGT: - case AMDGPU::GROUP_BARRIER: + case R600::KILLGT: + case R600::GROUP_BARRIER: return true; default: return false; @@ -223,11 +229,11 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { } bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const { - return MI.findRegisterUseOperandIdx(AMDGPU::AR_X) != -1; + return MI.findRegisterUseOperandIdx(R600::AR_X) != -1; } bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const { - return MI.findRegisterDefOperandIdx(AMDGPU::AR_X) != -1; + return MI.findRegisterDefOperandIdx(R600::AR_X) != -1; } bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const { @@ -241,7 +247,7 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const { TargetRegisterInfo::isVirtualRegister(I->getReg())) continue; - if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg())) + if (R600::R600_LDS_SRC_REGRegClass.contains(I->getReg())) return true; } return false; @@ -249,17 +255,17 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const { int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { static const unsigned SrcSelTable[][2] = { - {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, - {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, - {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, - {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, - {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, - {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, - {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, - {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, - {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, - 
{AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, - {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W} + {R600::OpName::src0, R600::OpName::src0_sel}, + {R600::OpName::src1, R600::OpName::src1_sel}, + {R600::OpName::src2, R600::OpName::src2_sel}, + {R600::OpName::src0_X, R600::OpName::src0_sel_X}, + {R600::OpName::src0_Y, R600::OpName::src0_sel_Y}, + {R600::OpName::src0_Z, R600::OpName::src0_sel_Z}, + {R600::OpName::src0_W, R600::OpName::src0_sel_W}, + {R600::OpName::src1_X, R600::OpName::src1_sel_X}, + {R600::OpName::src1_Y, R600::OpName::src1_sel_Y}, + {R600::OpName::src1_Z, R600::OpName::src1_sel_Z}, + {R600::OpName::src1_W, R600::OpName::src1_sel_W} }; for (const auto &Row : SrcSelTable) { @@ -274,23 +280,23 @@ SmallVector<std::pair<MachineOperand *, int64_t>, 3> R600InstrInfo::getSrcs(MachineInstr &MI) const { SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result; - if (MI.getOpcode() == AMDGPU::DOT_4) { + if (MI.getOpcode() == R600::DOT_4) { static const unsigned OpTable[8][2] = { - {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, - {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, - {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, - {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, - {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, - {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, - {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, - {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}, + {R600::OpName::src0_X, R600::OpName::src0_sel_X}, + {R600::OpName::src0_Y, R600::OpName::src0_sel_Y}, + {R600::OpName::src0_Z, R600::OpName::src0_sel_Z}, + {R600::OpName::src0_W, R600::OpName::src0_sel_W}, + {R600::OpName::src1_X, R600::OpName::src1_sel_X}, + {R600::OpName::src1_Y, R600::OpName::src1_sel_Y}, + {R600::OpName::src1_Z, R600::OpName::src1_sel_Z}, + {R600::OpName::src1_W, R600::OpName::src1_sel_W}, }; for (unsigned j = 0; j < 8; j++) { MachineOperand &MO = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0])); unsigned Reg = MO.getReg(); - if (Reg == AMDGPU::ALU_CONST) { + if (Reg == R600::ALU_CONST) { MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); Result.push_back(std::make_pair(&MO, Sel.getImm())); @@ -302,9 +308,9 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { } static const unsigned OpTable[3][2] = { - {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, - {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, - {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, + {R600::OpName::src0, R600::OpName::src0_sel}, + {R600::OpName::src1, R600::OpName::src1_sel}, + {R600::OpName::src2, R600::OpName::src2_sel}, }; for (unsigned j = 0; j < 3; j++) { @@ -313,15 +319,15 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { break; MachineOperand &MO = MI.getOperand(SrcIdx); unsigned Reg = MO.getReg(); - if (Reg == AMDGPU::ALU_CONST) { + if (Reg == R600::ALU_CONST) { MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); Result.push_back(std::make_pair(&MO, Sel.getImm())); continue; } - if (Reg == AMDGPU::ALU_LITERAL_X) { + if (Reg == R600::ALU_LITERAL_X) { MachineOperand &Operand = - MI.getOperand(getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal)); + MI.getOperand(getOperandIdx(MI.getOpcode(), R600::OpName::literal)); if (Operand.isImm()) { Result.push_back(std::make_pair(&MO, Operand.getImm())); continue; @@ -345,7 +351,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI, ++i; unsigned Reg = Src.first->getReg(); int Index = RI.getEncodingValue(Reg) & 0xff; - if (Reg == AMDGPU::OQAP) { 
+ if (Reg == R600::OQAP) { Result.push_back(std::make_pair(Index, 0U)); } if (PV.find(Reg) != PV.end()) { @@ -435,7 +441,7 @@ unsigned R600InstrInfo::isLegalUpTo( const std::pair<int, unsigned> &Src = Srcs[j]; if (Src.first < 0 || Src.first == 255) continue; - if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) { + if (Src.first == GET_REG_INDEX(RI.getEncodingValue(R600::OQAP))) { if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 && Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) { // The value from output queue A (denoted by register OQAP) can @@ -541,7 +547,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, for (unsigned i = 0, e = IG.size(); i < e; ++i) { IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount)); unsigned Op = getOperandIdx(IG[i]->getOpcode(), - AMDGPU::OpName::bank_swizzle); + R600::OpName::bank_swizzle); ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) IG[i]->getOperand(Op).getImm()); } @@ -610,14 +616,14 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs) continue; for (const auto &Src : getSrcs(MI)) { - if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X) + if (Src.first->getReg() == R600::ALU_LITERAL_X) Literals.insert(Src.second); if (Literals.size() > 4) return false; - if (Src.first->getReg() == AMDGPU::ALU_CONST) + if (Src.first->getReg() == R600::ALU_CONST) Consts.push_back(Src.second); - if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) || - AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) { + if (R600::R600_KC0RegClass.contains(Src.first->getReg()) || + R600::R600_KC1RegClass.contains(Src.first->getReg())) { unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff; unsigned Chan = RI.getHWRegChan(Src.first->getReg()); Consts.push_back((Index << 2) | Chan); @@ -636,7 +642,7 @@ R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const { static bool isPredicateSetter(unsigned Opcode) { switch (Opcode) { - case AMDGPU::PRED_X: + case R600::PRED_X: return true; default: return false; @@ -658,12 +664,12 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB, static bool isJump(unsigned Opcode) { - return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND; + return Opcode == R600::JUMP || Opcode == R600::JUMP_COND; } static bool isBranch(unsigned Opcode) { - return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 || - Opcode == AMDGPU::BRANCH_COND_f32; + return Opcode == R600::BRANCH || Opcode == R600::BRANCH_COND_i32 || + Opcode == R600::BRANCH_COND_f32; } bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, @@ -678,7 +684,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, if (I == MBB.end()) return false; - // AMDGPU::BRANCH* instructions are only available after isel and are not + // R600::BRANCH* instructions are only available after isel and are not // handled if (isBranch(I->getOpcode())) return true; @@ -687,7 +693,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, } // Remove successive JUMP - while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) { + while (I != MBB.begin() && std::prev(I)->getOpcode() == R600::JUMP) { MachineBasicBlock::iterator PriorI = std::prev(I); if (AllowModify) I->removeFromParent(); @@ -698,10 +704,10 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, // If there is only one terminator instruction, process it. 
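In the fitsConstReadLimitations hunk above, a candidate instruction group is rejected once it would need more than four distinct literal values (constant-buffer reads are collected separately and checked afterwards). A standalone model of just that literal check, with made-up values:

  // Standalone model of the "at most four distinct literals per group" check.
  #include <cassert>
  #include <cstdint>
  #include <set>
  #include <vector>

  bool fitsLiteralLimit(const std::vector<int64_t> &LiteralUses) {
    std::set<int64_t> Distinct(LiteralUses.begin(), LiteralUses.end());
    return Distinct.size() <= 4; // repeated uses of the same literal share a slot
  }

  int main() {
    assert(fitsLiteralLimit({42, 42, 7}));      // two distinct literals: fine
    assert(!fitsLiteralLimit({1, 2, 3, 4, 5})); // five distinct literals: rejected
    return 0;
  }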
unsigned LastOpc = LastInst.getOpcode(); if (I == MBB.begin() || !isJump((--I)->getOpcode())) { - if (LastOpc == AMDGPU::JUMP) { + if (LastOpc == R600::JUMP) { TBB = LastInst.getOperand(0).getMBB(); return false; - } else if (LastOpc == AMDGPU::JUMP_COND) { + } else if (LastOpc == R600::JUMP_COND) { auto predSet = I; while (!isPredicateSetter(predSet->getOpcode())) { predSet = --I; @@ -709,7 +715,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, TBB = LastInst.getOperand(0).getMBB(); Cond.push_back(predSet->getOperand(1)); Cond.push_back(predSet->getOperand(2)); - Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + Cond.push_back(MachineOperand::CreateReg(R600::PRED_SEL_ONE, false)); return false; } return true; // Can't handle indirect branch. @@ -720,7 +726,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, unsigned SecondLastOpc = SecondLastInst.getOpcode(); // If the block ends with a B and a Bcc, handle it. - if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) { + if (SecondLastOpc == R600::JUMP_COND && LastOpc == R600::JUMP) { auto predSet = --I; while (!isPredicateSetter(predSet->getOpcode())) { predSet = --I; @@ -729,7 +735,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, FBB = LastInst.getOperand(0).getMBB(); Cond.push_back(predSet->getOperand(1)); Cond.push_back(predSet->getOperand(2)); - Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + Cond.push_back(MachineOperand::CreateReg(R600::PRED_SEL_ONE, false)); return false; } @@ -741,8 +747,8 @@ static MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend(); It != E; ++It) { - if (It->getOpcode() == AMDGPU::CF_ALU || - It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + if (It->getOpcode() == R600::CF_ALU || + It->getOpcode() == R600::CF_ALU_PUSH_BEFORE) return It.getReverse(); } return MBB.end(); @@ -759,7 +765,7 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB, if (!FBB) { if (Cond.empty()) { - BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB); + BuildMI(&MBB, DL, get(R600::JUMP)).addMBB(TBB); return 1; } else { MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); @@ -767,14 +773,14 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB, addFlag(*PredSet, 0, MO_FLAG_PUSH); PredSet->getOperand(2).setImm(Cond[1].getImm()); - BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + BuildMI(&MBB, DL, get(R600::JUMP_COND)) .addMBB(TBB) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + .addReg(R600::PREDICATE_BIT, RegState::Kill); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) return 1; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); - CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); + assert (CfAlu->getOpcode() == R600::CF_ALU); + CfAlu->setDesc(get(R600::CF_ALU_PUSH_BEFORE)); return 1; } } else { @@ -782,15 +788,15 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB, assert(PredSet && "No previous predicate !"); addFlag(*PredSet, 0, MO_FLAG_PUSH); PredSet->getOperand(2).setImm(Cond[1].getImm()); - BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + BuildMI(&MBB, DL, get(R600::JUMP_COND)) .addMBB(TBB) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); - BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB); + .addReg(R600::PREDICATE_BIT, RegState::Kill); + BuildMI(&MBB, DL, get(R600::JUMP)).addMBB(FBB); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) 
return 2; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); - CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); + assert (CfAlu->getOpcode() == R600::CF_ALU); + CfAlu->setDesc(get(R600::CF_ALU_PUSH_BEFORE)); return 2; } } @@ -811,18 +817,18 @@ unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB, switch (I->getOpcode()) { default: return 0; - case AMDGPU::JUMP_COND: { + case R600::JUMP_COND: { MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); clearFlag(*predSet, 0, MO_FLAG_PUSH); I->eraseFromParent(); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) break; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); - CfAlu->setDesc(get(AMDGPU::CF_ALU)); + assert (CfAlu->getOpcode() == R600::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(R600::CF_ALU)); break; } - case AMDGPU::JUMP: + case R600::JUMP: I->eraseFromParent(); break; } @@ -836,18 +842,18 @@ unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB, // FIXME: only one case?? default: return 1; - case AMDGPU::JUMP_COND: { + case R600::JUMP_COND: { MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); clearFlag(*predSet, 0, MO_FLAG_PUSH); I->eraseFromParent(); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) break; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); - CfAlu->setDesc(get(AMDGPU::CF_ALU)); + assert (CfAlu->getOpcode() == R600::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(R600::CF_ALU)); break; } - case AMDGPU::JUMP: + case R600::JUMP: I->eraseFromParent(); break; } @@ -862,9 +868,9 @@ bool R600InstrInfo::isPredicated(const MachineInstr &MI) const { unsigned Reg = MI.getOperand(idx).getReg(); switch (Reg) { default: return false; - case AMDGPU::PRED_SEL_ONE: - case AMDGPU::PRED_SEL_ZERO: - case AMDGPU::PREDICATE_BIT: + case R600::PRED_SEL_ONE: + case R600::PRED_SEL_ZERO: + case R600::PREDICATE_BIT: return true; } } @@ -875,9 +881,9 @@ bool R600InstrInfo::isPredicable(const MachineInstr &MI) const { // be predicated. Until we have proper support for instruction clauses in the // backend, we will mark KILL* instructions as unpredicable. - if (MI.getOpcode() == AMDGPU::KILLGT) { + if (MI.getOpcode() == R600::KILLGT) { return false; - } else if (MI.getOpcode() == AMDGPU::CF_ALU) { + } else if (MI.getOpcode() == R600::CF_ALU) { // If the clause start in the middle of MBB then the MBB has more // than a single clause, unable to predicate several clauses. 
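The reverseBranchCondition hunk further down in the R600InstrInfo.cpp diff is a pure table flip: each PRED_SET* compare swaps with its negated twin and the PRED_SEL_ZERO/PRED_SEL_ONE selector is inverted, so reversing twice round-trips, and anything outside the table reports failure. A standalone sketch of that involution, with a reduced enum standing in for the real opcodes:

  // Standalone sketch of the condition-reversal table (reduced enum, not real opcodes).
  #include <cassert>

  enum class Pred { SetE, SetNE, SetE_Int, SetNE_Int, Other };

  bool reverseCondition(Pred &P) {
    switch (P) {
    case Pred::SetE:      P = Pred::SetNE;     return false; // false == success, as in LLVM
    case Pred::SetNE:     P = Pred::SetE;      return false;
    case Pred::SetE_Int:  P = Pred::SetNE_Int; return false;
    case Pred::SetNE_Int: P = Pred::SetE_Int;  return false;
    default:              return true;  // unknown predicate: cannot reverse
    }
  }

  int main() {
    Pred P = Pred::SetE;
    assert(!reverseCondition(P) && P == Pred::SetNE);
    assert(!reverseCondition(P) && P == Pred::SetE);  // reversing twice round-trips
    Pred Q = Pred::Other;
    assert(reverseCondition(Q));
    return 0;
  }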
if (MI.getParent()->begin() != MachineBasicBlock::const_iterator(MI)) @@ -887,7 +893,7 @@ bool R600InstrInfo::isPredicable(const MachineInstr &MI) const { } else if (isVector(MI)) { return false; } else { - return AMDGPUInstrInfo::isPredicable(MI); + return TargetInstrInfo::isPredicable(MI); } } @@ -928,17 +934,17 @@ bool R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { MachineOperand &MO = Cond[1]; switch (MO.getImm()) { - case AMDGPU::PRED_SETE_INT: - MO.setImm(AMDGPU::PRED_SETNE_INT); + case R600::PRED_SETE_INT: + MO.setImm(R600::PRED_SETNE_INT); break; - case AMDGPU::PRED_SETNE_INT: - MO.setImm(AMDGPU::PRED_SETE_INT); + case R600::PRED_SETNE_INT: + MO.setImm(R600::PRED_SETE_INT); break; - case AMDGPU::PRED_SETE: - MO.setImm(AMDGPU::PRED_SETNE); + case R600::PRED_SETE: + MO.setImm(R600::PRED_SETNE); break; - case AMDGPU::PRED_SETNE: - MO.setImm(AMDGPU::PRED_SETE); + case R600::PRED_SETNE: + MO.setImm(R600::PRED_SETE); break; default: return true; @@ -946,11 +952,11 @@ R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) con MachineOperand &MO2 = Cond[2]; switch (MO2.getReg()) { - case AMDGPU::PRED_SEL_ZERO: - MO2.setReg(AMDGPU::PRED_SEL_ONE); + case R600::PRED_SEL_ZERO: + MO2.setReg(R600::PRED_SEL_ONE); break; - case AMDGPU::PRED_SEL_ONE: - MO2.setReg(AMDGPU::PRED_SEL_ZERO); + case R600::PRED_SEL_ONE: + MO2.setReg(R600::PRED_SEL_ZERO); break; default: return true; @@ -967,22 +973,22 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI, ArrayRef<MachineOperand> Pred) const { int PIdx = MI.findFirstPredOperandIdx(); - if (MI.getOpcode() == AMDGPU::CF_ALU) { + if (MI.getOpcode() == R600::CF_ALU) { MI.getOperand(8).setImm(0); return true; } - if (MI.getOpcode() == AMDGPU::DOT_4) { - MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_X)) + if (MI.getOpcode() == R600::DOT_4) { + MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_X)) .setReg(Pred[2].getReg()); - MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Y)) + MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_Y)) .setReg(Pred[2].getReg()); - MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Z)) + MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_Z)) .setReg(Pred[2].getReg()); - MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_W)) + MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_W)) .setReg(Pred[2].getReg()); MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); - MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit); return true; } @@ -990,7 +996,7 @@ bool R600InstrInfo::PredicateInstruction(MachineInstr &MI, MachineOperand &PMO = MI.getOperand(PIdx); PMO.setReg(Pred[2].getReg()); MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); - MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit); return true; } @@ -1020,20 +1026,20 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { default: { MachineBasicBlock *MBB = MI.getParent(); int OffsetOpIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::addr); + R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::addr); // addr is a custom operand with multiple MI operands, and only the // first MI operand is given a name. 
int RegOpIdx = OffsetOpIdx + 1; int ChanOpIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::chan); + R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::chan); if (isRegisterLoad(MI)) { int DstOpIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::dst); unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); unsigned Address = calculateIndirectAddress(RegIndex, Channel); unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); - if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + if (OffsetReg == R600::INDIRECT_BASE_ADDR) { buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(), getIndirectAddrRegClass()->getRegister(Address)); } else { @@ -1042,12 +1048,12 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } } else if (isRegisterStore(MI)) { int ValOpIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::val); + R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::val); unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); unsigned Address = calculateIndirectAddress(RegIndex, Channel); unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); - if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + if (OffsetReg == R600::INDIRECT_BASE_ADDR) { buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), MI.getOperand(ValOpIdx).getReg()); } else { @@ -1062,15 +1068,15 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MBB->erase(MI); return true; } - case AMDGPU::R600_EXTRACT_ELT_V2: - case AMDGPU::R600_EXTRACT_ELT_V4: + case R600::R600_EXTRACT_ELT_V2: + case R600::R600_EXTRACT_ELT_V4: buildIndirectRead(MI.getParent(), MI, MI.getOperand(0).getReg(), RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address MI.getOperand(2).getReg(), RI.getHWRegChan(MI.getOperand(1).getReg())); break; - case AMDGPU::R600_INSERT_ELT_V2: - case AMDGPU::R600_INSERT_ELT_V4: + case R600::R600_INSERT_ELT_V2: + case R600::R600_INSERT_ELT_V4: buildIndirectWrite(MI.getParent(), MI, MI.getOperand(2).getReg(), // Value RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address MI.getOperand(3).getReg(), // Offset @@ -1082,7 +1088,8 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const { + const MachineFunction &MF, + const R600RegisterInfo &TRI) const { const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); const R600FrameLowering *TFL = ST.getFrameLowering(); @@ -1093,17 +1100,15 @@ void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, return; for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) { - unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index); - Reserved.set(SuperReg); for (unsigned Chan = 0; Chan < StackWidth; ++Chan) { - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan); - Reserved.set(Reg); + unsigned Reg = R600::R600_TReg32RegClass.getRegister((4 * Index) + Chan); + TRI.reserveRegisterTuples(Reserved, Reg); } } } const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const { - return &AMDGPU::R600_TReg32_XRegClass; + return &R600::R600_TReg32_XRegClass; } MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, @@ -1121,20 +1126,20 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, unsigned AddrReg; switch 
(AddrChan) { default: llvm_unreachable("Invalid Channel"); - case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; - case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; - case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; - case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + case 0: AddrReg = R600::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = R600::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = R600::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = R600::R600_Addr_WRegClass.getRegister(Address); break; } - MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, - AMDGPU::AR_X, OffsetReg); - setImmOperand(*MOVA, AMDGPU::OpName::write, 0); + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, R600::MOVA_INT_eg, + R600::AR_X, OffsetReg); + setImmOperand(*MOVA, R600::OpName::write, 0); - MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, R600::MOV, AddrReg, ValueReg) - .addReg(AMDGPU::AR_X, + .addReg(R600::AR_X, RegState::Implicit | RegState::Kill); - setImmOperand(*Mov, AMDGPU::OpName::dst_rel, 1); + setImmOperand(*Mov, R600::OpName::dst_rel, 1); return Mov; } @@ -1153,21 +1158,21 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, unsigned AddrReg; switch (AddrChan) { default: llvm_unreachable("Invalid Channel"); - case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; - case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; - case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; - case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + case 0: AddrReg = R600::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = R600::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = R600::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = R600::R600_Addr_WRegClass.getRegister(Address); break; } - MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, - AMDGPU::AR_X, + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, R600::MOVA_INT_eg, + R600::AR_X, OffsetReg); - setImmOperand(*MOVA, AMDGPU::OpName::write, 0); - MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + setImmOperand(*MOVA, R600::OpName::write, 0); + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, R600::MOV, ValueReg, AddrReg) - .addReg(AMDGPU::AR_X, + .addReg(R600::AR_X, RegState::Implicit | RegState::Kill); - setImmOperand(*Mov, AMDGPU::OpName::src0_rel, 1); + setImmOperand(*Mov, R600::OpName::src0_rel, 1); return Mov; } @@ -1265,7 +1270,7 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB //XXX: The r600g finalizer expects this to be 1, once we've moved the //scheduling to the backend, we can change the default to 0. 
MIB.addImm(1) // $last - .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel + .addReg(R600::PRED_SEL_OFF) // $pred_sel .addImm(0) // $literal .addImm(0); // $bank_swizzle @@ -1286,23 +1291,23 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB static unsigned getSlotedOps(unsigned Op, unsigned Slot) { switch (Op) { - OPERAND_CASE(AMDGPU::OpName::update_exec_mask) - OPERAND_CASE(AMDGPU::OpName::update_pred) - OPERAND_CASE(AMDGPU::OpName::write) - OPERAND_CASE(AMDGPU::OpName::omod) - OPERAND_CASE(AMDGPU::OpName::dst_rel) - OPERAND_CASE(AMDGPU::OpName::clamp) - OPERAND_CASE(AMDGPU::OpName::src0) - OPERAND_CASE(AMDGPU::OpName::src0_neg) - OPERAND_CASE(AMDGPU::OpName::src0_rel) - OPERAND_CASE(AMDGPU::OpName::src0_abs) - OPERAND_CASE(AMDGPU::OpName::src0_sel) - OPERAND_CASE(AMDGPU::OpName::src1) - OPERAND_CASE(AMDGPU::OpName::src1_neg) - OPERAND_CASE(AMDGPU::OpName::src1_rel) - OPERAND_CASE(AMDGPU::OpName::src1_abs) - OPERAND_CASE(AMDGPU::OpName::src1_sel) - OPERAND_CASE(AMDGPU::OpName::pred_sel) + OPERAND_CASE(R600::OpName::update_exec_mask) + OPERAND_CASE(R600::OpName::update_pred) + OPERAND_CASE(R600::OpName::write) + OPERAND_CASE(R600::OpName::omod) + OPERAND_CASE(R600::OpName::dst_rel) + OPERAND_CASE(R600::OpName::clamp) + OPERAND_CASE(R600::OpName::src0) + OPERAND_CASE(R600::OpName::src0_neg) + OPERAND_CASE(R600::OpName::src0_rel) + OPERAND_CASE(R600::OpName::src0_abs) + OPERAND_CASE(R600::OpName::src0_sel) + OPERAND_CASE(R600::OpName::src1) + OPERAND_CASE(R600::OpName::src1_neg) + OPERAND_CASE(R600::OpName::src1_rel) + OPERAND_CASE(R600::OpName::src1_abs) + OPERAND_CASE(R600::OpName::src1_sel) + OPERAND_CASE(R600::OpName::pred_sel) default: llvm_unreachable("Wrong Operand"); } @@ -1313,39 +1318,39 @@ static unsigned getSlotedOps(unsigned Op, unsigned Slot) { MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg) const { - assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); + assert (MI->getOpcode() == R600::DOT_4 && "Not Implemented"); unsigned Opcode; - if (ST.getGeneration() <= R600Subtarget::R700) - Opcode = AMDGPU::DOT4_r600; + if (ST.getGeneration() <= AMDGPUSubtarget::R700) + Opcode = R600::DOT4_r600; else - Opcode = AMDGPU::DOT4_eg; + Opcode = R600::DOT4_eg; MachineBasicBlock::iterator I = MI; MachineOperand &Src0 = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot))); + getOperandIdx(MI->getOpcode(), getSlotedOps(R600::OpName::src0, Slot))); MachineOperand &Src1 = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot))); + getOperandIdx(MI->getOpcode(), getSlotedOps(R600::OpName::src1, Slot))); MachineInstr *MIB = buildDefaultInstruction( MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg()); static const unsigned Operands[14] = { - AMDGPU::OpName::update_exec_mask, - AMDGPU::OpName::update_pred, - AMDGPU::OpName::write, - AMDGPU::OpName::omod, - AMDGPU::OpName::dst_rel, - AMDGPU::OpName::clamp, - AMDGPU::OpName::src0_neg, - AMDGPU::OpName::src0_rel, - AMDGPU::OpName::src0_abs, - AMDGPU::OpName::src0_sel, - AMDGPU::OpName::src1_neg, - AMDGPU::OpName::src1_rel, - AMDGPU::OpName::src1_abs, - AMDGPU::OpName::src1_sel, + R600::OpName::update_exec_mask, + R600::OpName::update_pred, + R600::OpName::write, + R600::OpName::omod, + R600::OpName::dst_rel, + R600::OpName::clamp, + R600::OpName::src0_neg, + R600::OpName::src0_rel, + R600::OpName::src0_abs, + R600::OpName::src0_sel, + R600::OpName::src1_neg, 
+ R600::OpName::src1_rel, + R600::OpName::src1_abs, + R600::OpName::src1_sel, }; MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), - getSlotedOps(AMDGPU::OpName::pred_sel, Slot))); - MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel)) + getSlotedOps(R600::OpName::pred_sel, Slot))); + MIB->getOperand(getOperandIdx(Opcode, R600::OpName::pred_sel)) .setReg(MO.getReg()); for (unsigned i = 0; i < 14; i++) { @@ -1362,16 +1367,16 @@ MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB, MachineBasicBlock::iterator I, unsigned DstReg, uint64_t Imm) const { - MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg, - AMDGPU::ALU_LITERAL_X); - setImmOperand(*MovImm, AMDGPU::OpName::literal, Imm); + MachineInstr *MovImm = buildDefaultInstruction(BB, I, R600::MOV, DstReg, + R600::ALU_LITERAL_X); + setImmOperand(*MovImm, R600::OpName::literal, Imm); return MovImm; } MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned DstReg, unsigned SrcReg) const { - return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg); + return buildDefaultInstruction(*MBB, I, R600::MOV, DstReg, SrcReg); } int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const { @@ -1379,7 +1384,7 @@ int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const { } int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned Op) const { - return AMDGPU::getNamedOperandIdx(Opcode, Op); + return R600::getNamedOperandIdx(Opcode, Op); } void R600InstrInfo::setImmOperand(MachineInstr &MI, unsigned Op, @@ -1406,25 +1411,25 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx, bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3; switch (Flag) { case MO_FLAG_CLAMP: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::clamp); + FlagIndex = getOperandIdx(MI, R600::OpName::clamp); break; case MO_FLAG_MASK: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::write); + FlagIndex = getOperandIdx(MI, R600::OpName::write); break; case MO_FLAG_NOT_LAST: case MO_FLAG_LAST: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::last); + FlagIndex = getOperandIdx(MI, R600::OpName::last); break; case MO_FLAG_NEG: switch (SrcIdx) { case 0: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_neg); + FlagIndex = getOperandIdx(MI, R600::OpName::src0_neg); break; case 1: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_neg); + FlagIndex = getOperandIdx(MI, R600::OpName::src1_neg); break; case 2: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src2_neg); + FlagIndex = getOperandIdx(MI, R600::OpName::src2_neg); break; } break; @@ -1435,10 +1440,10 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx, (void)IsOP3; switch (SrcIdx) { case 0: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_abs); + FlagIndex = getOperandIdx(MI, R600::OpName::src0_abs); break; case 1: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_abs); + FlagIndex = getOperandIdx(MI, R600::OpName::src1_abs); break; } break; @@ -1499,15 +1504,15 @@ unsigned R600InstrInfo::getAddressSpaceForPseudoSourceKind( switch (Kind) { case PseudoSourceValue::Stack: case PseudoSourceValue::FixedStack: - return AMDGPUASI.PRIVATE_ADDRESS; + return ST.getAMDGPUAS().PRIVATE_ADDRESS; case PseudoSourceValue::ConstantPool: case PseudoSourceValue::GOT: case PseudoSourceValue::JumpTable: case PseudoSourceValue::GlobalValueCallEntry: case PseudoSourceValue::ExternalSymbolCallEntry: case 
PseudoSourceValue::TargetCustom: - return AMDGPUASI.CONSTANT_ADDRESS; + return ST.getAMDGPUAS().CONSTANT_ADDRESS; } llvm_unreachable("Invalid pseudo source kind"); - return AMDGPUASI.PRIVATE_ADDRESS; + return ST.getAMDGPUAS().PRIVATE_ADDRESS; } diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h index abaa37450758..7a3dece31665 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.h +++ b/lib/Target/AMDGPU/R600InstrInfo.h @@ -8,15 +8,18 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Interface definition for R600InstrInfo +/// Interface definition for R600InstrInfo // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H #define LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H -#include "AMDGPUInstrInfo.h" #include "R600RegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "R600GenInstrInfo.inc" namespace llvm { @@ -34,7 +37,7 @@ class MachineInstr; class MachineInstrBuilder; class R600Subtarget; -class R600InstrInfo final : public AMDGPUInstrInfo { +class R600InstrInfo final : public R600GenInstrInfo { private: const R600RegisterInfo RI; const R600Subtarget &ST; @@ -150,7 +153,7 @@ public: /// Same but using const index set instead of MI set. bool fitsConstReadLimitations(const std::vector<unsigned>&) const; - /// \brief Vector instructions are instructions that must fill all + /// Vector instructions are instructions that must fill all /// instruction slots within an instruction group. bool isVector(const MachineInstr &MI) const; @@ -209,9 +212,10 @@ public: bool expandPostRAPseudo(MachineInstr &MI) const override; - /// \brief Reserve the registers that may be accesed using indirect addressing. + /// Reserve the registers that may be accesed using indirect addressing. void reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const; + const MachineFunction &MF, + const R600RegisterInfo &TRI) const; /// Calculate the "Indirect Address" for the given \p RegIndex and /// \p Channel @@ -235,7 +239,7 @@ public: /// read or write or -1 if indirect addressing is not used by this program. int getIndirectIndexEnd(const MachineFunction &MF) const; - /// \brief Build instruction(s) for an indirect register write. + /// Build instruction(s) for an indirect register write. /// /// \returns The instruction that performs the indirect register write MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, @@ -243,7 +247,7 @@ public: unsigned ValueReg, unsigned Address, unsigned OffsetReg) const; - /// \brief Build instruction(s) for an indirect register read. + /// Build instruction(s) for an indirect register read. /// /// \returns The instruction that performs the indirect register read MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, @@ -281,23 +285,23 @@ public: MachineBasicBlock::iterator I, unsigned DstReg, unsigned SrcReg) const; - /// \brief Get the index of Op in the MachineInstr. + /// Get the index of Op in the MachineInstr. /// /// \returns -1 if the Instruction does not contain the specified \p Op. int getOperandIdx(const MachineInstr &MI, unsigned Op) const; - /// \brief Get the index of \p Op for the given Opcode. + /// Get the index of \p Op for the given Opcode. /// /// \returns -1 if the Instruction does not contain the specified \p Op. int getOperandIdx(unsigned Opcode, unsigned Op) const; - /// \brief Helper function for setting instruction flag values. 
+ /// Helper function for setting instruction flag values. void setImmOperand(MachineInstr &MI, unsigned Op, int64_t Imm) const; - ///\brief Add one of the MO_FLAG* flags to the specified \p Operand. + ///Add one of the MO_FLAG* flags to the specified \p Operand. void addFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const; - ///\brief Determine if the specified \p Flag is set on this \p Operand. + ///Determine if the specified \p Flag is set on this \p Operand. bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const; /// \param SrcIdx The register source to set the flag on (e.g src0, src1, src2) @@ -307,7 +311,7 @@ public: MachineOperand &getFlagOp(MachineInstr &MI, unsigned SrcIdx = 0, unsigned Flag = 0) const; - /// \brief Clear the specified flag on the instruction. + /// Clear the specified flag on the instruction. void clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const; // Helper functions that check the opcode for status information @@ -323,7 +327,7 @@ public: PseudoSourceValue::PSVKind Kind) const override; }; -namespace AMDGPU { +namespace R600 { int getLDSNoRetOp(uint16_t Opcode); diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td index 801e4e61fca6..7bf174f4cd86 100644 --- a/lib/Target/AMDGPU/R600Instructions.td +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -12,20 +12,19 @@ // //===----------------------------------------------------------------------===// -include "R600Intrinsics.td" include "R600InstrFormats.td" // FIXME: Should not be arbitrarily split from other R600 inst classes. class R600WrapperInst <dag outs, dag ins, string asm = "", list<dag> pattern = []> : AMDGPUInst<outs, ins, asm, pattern>, PredicateControl { let SubtargetPredicate = isR600toCayman; + let Namespace = "R600"; } class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern = []> : InstR600 <outs, ins, asm, pattern, NullALU> { - let Namespace = "AMDGPU"; } def MEMxi : Operand<iPTR> { @@ -81,11 +80,18 @@ def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>; def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>; def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>; def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>; +def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; def R600_Pred : PredicateOperand<i32, (ops R600_Predicate), (ops PRED_SEL_OFF)>; +let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, + usesCustomInserter = 1, Namespace = "R600" in { + def RETURN : ILFormat<(outs), (ins variable_ops), + "RETURN", [(AMDGPUendpgm)] + >; +} let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { @@ -219,34 +225,6 @@ class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern, } // End mayLoad = 1, mayStore = 0, hasSideEffects = 0 -def TEX_SHADOW : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return (TType >= 6 && TType <= 8) || TType == 13; - }] ->; - -def TEX_RECT : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 5; - }] ->; - -def TEX_ARRAY : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 9 || TType == 10 || TType == 16; - }] ->; - -def TEX_SHADOW_ARRAY : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 11 || TType == 12 || TType == 17; - }] ->; - class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask, dag 
outs, dag ins, string asm, list<dag> pattern> : InstR600ISA <outs, ins, asm, pattern>, @@ -357,6 +335,8 @@ def vtx_id2_load : LoadVtxId2 <load>; // R600 SDNodes //===----------------------------------------------------------------------===// +let Namespace = "R600" in { + def INTERP_PAIR_XY : AMDGPUShaderInst < (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1), (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), @@ -369,6 +349,8 @@ def INTERP_PAIR_ZW : AMDGPUShaderInst < "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1", []>; +} + def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, [SDNPVariadic] @@ -416,11 +398,15 @@ def : R600Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR, // Interpolation Instructions //===----------------------------------------------------------------------===// +let Namespace = "R600" in { + def INTERP_VEC_LOAD : AMDGPUShaderInst < (outs R600_Reg128:$dst), (ins i32imm:$src0), "INTERP_LOAD $src0 : $dst">; +} + def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> { let bank_swizzle = 5; } @@ -660,14 +646,7 @@ def PAD : R600WrapperInst <(outs), (ins), "PAD", [] > { let isCodeGenOnly = 1, isPseudo = 1 in { -let usesCustomInserter = 1 in { - -class CLAMP <RegisterClass rc> : AMDGPUShaderInst < - (outs rc:$dst), - (ins rc:$src0), - "CLAMP $dst, $src0", - [(set f32:$dst, (AMDGPUclamp f32:$src0))] ->; +let Namespace = "R600", usesCustomInserter = 1 in { class FABS <RegisterClass rc> : AMDGPUShaderInst < (outs rc:$dst), @@ -799,7 +778,9 @@ class MOV_IMM <ValueType vt, Operand immType> : R600WrapperInst < (ins immType:$imm), "", [] ->; +> { + let Namespace = "R600"; +} } // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 @@ -1014,7 +995,7 @@ class CNDGE_Common <bits<5> inst> : R600_3OP < } -let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +let isCodeGenOnly = 1, isPseudo = 1, Namespace = "R600" in { class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins // Slot X UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X, @@ -1193,7 +1174,6 @@ class COS_Common <bits<11> inst> : R600_1OP < let Itinerary = TransALU; } -def CLAMP_R600 : CLAMP <R600_Reg32>; def FABS_R600 : FABS<R600_Reg32>; def FNEG_R600 : FNEG<R600_Reg32>; @@ -1334,7 +1314,9 @@ let Predicates = [isR600] in { // Regist loads and stores - for indirect addressing //===----------------------------------------------------------------------===// +let Namespace = "R600" in { defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>; +} // Hardcode channel to 0 // NOTE: LSHR is not available here. 
LSHR is per family instruction @@ -1386,11 +1368,12 @@ let usesCustomInserter = 1 in { let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { -def MASK_WRITE : AMDGPUShaderInst < +def MASK_WRITE : InstR600 < (outs), (ins R600_Reg32:$src), "MASK_WRITE $src", - [] + [], + NullALU >; } // End mayLoad = 0, mayStore = 0, hasSideEffects = 1 @@ -1421,7 +1404,7 @@ def TXD_SHADOW: InstR600 < // Constant Buffer Addressing Support //===----------------------------------------------------------------------===// -let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "R600" in { def CONST_COPY : Instruction { let OutOperandList = (outs R600_Reg32:$dst); let InOperandList = (ins i32imm:$src); @@ -1544,23 +1527,6 @@ let Inst{63-32} = Word1; //===---------------------------------------------------------------------===// // Flow and Program control Instructions //===---------------------------------------------------------------------===// -class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> -: Instruction { - - let Namespace = "AMDGPU"; - dag OutOperandList = outs; - dag InOperandList = ins; - let Pattern = pattern; - let AsmString = !strconcat(asmstr, "\n"); - let isPseudo = 1; - let Itinerary = NullALU; - bit hasIEEEFlag = 0; - bit hasZeroOpFlag = 0; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let isCodeGenOnly = 1; -} multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> { def _i32 : ILFormat<(outs), @@ -1592,23 +1558,14 @@ multiclass BranchInstr2<string name> { // Custom Inserter for Branches and returns, this eventually will be a // separate pass //===---------------------------------------------------------------------===// -let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { +let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1, + Namespace = "R600" in { def BRANCH : ILFormat<(outs), (ins brtarget:$target), "; Pseudo unconditional branch instruction", [(br bb:$target)]>; defm BRANCH_COND : BranchConditional<IL_brcond, R600_Reg32, R600_Reg32>; } -//===---------------------------------------------------------------------===// -// Return instruction -//===---------------------------------------------------------------------===// -let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, - usesCustomInserter = 1 in { - def RETURN : ILFormat<(outs), (ins variable_ops), - "RETURN", [(AMDGPUendpgm)] - >; -} - //===----------------------------------------------------------------------===// // Branch Instructions //===----------------------------------------------------------------------===// @@ -1738,13 +1695,8 @@ def : R600Pat < >; // KIL Patterns -def KILP : R600Pat < - (int_AMDGPU_kilp), - (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO))) ->; - def KIL : R600Pat < - (int_AMDGPU_kill f32:$src0), + (int_r600_kill f32:$src0), (MASK_WRITE (KILLGT (f32 ZERO), $src0)) >; diff --git a/lib/Target/AMDGPU/R600Intrinsics.td b/lib/Target/AMDGPU/R600Intrinsics.td deleted file mode 100644 index 4c9e1e8a5434..000000000000 --- a/lib/Target/AMDGPU/R600Intrinsics.td +++ /dev/null @@ -1,67 +0,0 @@ -//===-- R600Intrinsics.td - R600 Instrinsic defs -------*- tablegen -*-----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// R600 Intrinsic Definitions -// -//===----------------------------------------------------------------------===// - -class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [ - llvm_v4f32_ty, // Coord - llvm_i32_ty, // offset_x - llvm_i32_ty, // offset_y, - llvm_i32_ty, // offset_z, - llvm_i32_ty, // resource_id - llvm_i32_ty, // samplerid - llvm_i32_ty, // coord_type_x - llvm_i32_ty, // coord_type_y - llvm_i32_ty, // coord_type_z - llvm_i32_ty], // coord_type_w - [IntrNoMem] ->; - -class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [ - llvm_v4i32_ty, // Coord - llvm_i32_ty, // offset_x - llvm_i32_ty, // offset_y, - llvm_i32_ty, // offset_z, - llvm_i32_ty, // resource_id - llvm_i32_ty, // samplerid - llvm_i32_ty, // coord_type_x - llvm_i32_ty, // coord_type_y - llvm_i32_ty, // coord_type_z - llvm_i32_ty], // coord_type_w - [IntrNoMem] ->; - -let TargetPrefix = "r600", isTarget = 1 in { - -def int_r600_store_swizzle : - Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [] ->; - -def int_r600_store_stream_output : Intrinsic< - [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [] ->; - -def int_r600_tex : TextureIntrinsicFloatInput; -def int_r600_texc : TextureIntrinsicFloatInput; -def int_r600_txl : TextureIntrinsicFloatInput; -def int_r600_txlc : TextureIntrinsicFloatInput; -def int_r600_txb : TextureIntrinsicFloatInput; -def int_r600_txbc : TextureIntrinsicFloatInput; -def int_r600_txf : TextureIntrinsicInt32Input; -def int_r600_txq : TextureIntrinsicInt32Input; -def int_r600_ddx : TextureIntrinsicFloatInput; -def int_r600_ddy : TextureIntrinsicFloatInput; - -def int_r600_dot4 : Intrinsic<[llvm_float_ty], - [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] ->; - -} // End TargetPrefix = "r600", isTarget = 1 diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp index a7e540f9d14d..a1429a2ac50f 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -8,13 +8,14 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief R600 Machine Scheduler interface +/// R600 Machine Scheduler interface // //===----------------------------------------------------------------------===// #include "R600MachineScheduler.h" #include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/Pass.h" @@ -78,7 +79,7 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { AllowSwitchFromAlu = true; } else { unsigned NeededWF = 62.5f / ALUFetchRationEstimate; - DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" ); + LLVM_DEBUG(dbgs() << NeededWF << " approx. 
Wavefronts Required\n"); // We assume the local GPR requirements to be "dominated" by the requirement // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and // after TEX are indeed likely to consume or generate values from/for the @@ -124,26 +125,24 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { NextInstKind = IDOther; } - DEBUG( - if (SU) { - dbgs() << " ** Pick node **\n"; - SU->dump(DAG); - } else { - dbgs() << "NO NODE \n"; - for (unsigned i = 0; i < DAG->SUnits.size(); i++) { - const SUnit &S = DAG->SUnits[i]; - if (!S.isScheduled) - S.dump(DAG); - } - } - ); + LLVM_DEBUG(if (SU) { + dbgs() << " ** Pick node **\n"; + SU->dump(DAG); + } else { + dbgs() << "NO NODE \n"; + for (unsigned i = 0; i < DAG->SUnits.size(); i++) { + const SUnit &S = DAG->SUnits[i]; + if (!S.isScheduled) + S.dump(DAG); + } + }); return SU; } void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { if (NextInstKind != CurInstKind) { - DEBUG(dbgs() << "Instruction Type Switch\n"); + LLVM_DEBUG(dbgs() << "Instruction Type Switch\n"); if (NextInstKind != IDAlu) OccupedSlotsMask |= 31; CurEmitted = 0; @@ -163,7 +162,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(), E = SU->getInstr()->operands_end(); It != E; ++It) { MachineOperand &MO = *It; - if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X) ++CurEmitted; } } @@ -172,8 +171,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { ++CurEmitted; } - - DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n"); + LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n"); if (CurInstKind != IDFetch) { MoveUnits(Pending[IDFetch], Available[IDFetch]); @@ -183,18 +181,18 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { static bool isPhysicalRegCopy(MachineInstr *MI) { - if (MI->getOpcode() != AMDGPU::COPY) + if (MI->getOpcode() != R600::COPY) return false; return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg()); } void R600SchedStrategy::releaseTopNode(SUnit *SU) { - DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG);); + LLVM_DEBUG(dbgs() << "Top Releasing "; SU->dump(DAG);); } void R600SchedStrategy::releaseBottomNode(SUnit *SU) { - DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG);); + LLVM_DEBUG(dbgs() << "Bottom Releasing "; SU->dump(DAG);); if (isPhysicalRegCopy(SU->getInstr())) { PhysicalRegCopy.push_back(SU); return; @@ -226,14 +224,14 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { return AluTrans; switch (MI->getOpcode()) { - case AMDGPU::PRED_X: + case R600::PRED_X: return AluPredX; - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: + case R600::INTERP_PAIR_XY: + case R600::INTERP_PAIR_ZW: + case R600::INTERP_VEC_LOAD: + case R600::DOT_4: return AluT_XYZW; - case AMDGPU::COPY: + case R600::COPY: if (MI->getOperand(1).isUndef()) { // MI will become a KILL, don't considers it in scheduling return AluDiscarded; @@ -248,7 +246,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { if(TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode()) || TII->isReductionOp(MI->getOpcode()) || - MI->getOpcode() == AMDGPU::GROUP_BARRIER) { + MI->getOpcode() == R600::GROUP_BARRIER) { return AluT_XYZW; } @@ -259,13 +257,13 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { // Is the 
result already assigned to a channel ? unsigned DestSubReg = MI->getOperand(0).getSubReg(); switch (DestSubReg) { - case AMDGPU::sub0: + case R600::sub0: return AluT_X; - case AMDGPU::sub1: + case R600::sub1: return AluT_Y; - case AMDGPU::sub2: + case R600::sub2: return AluT_Z; - case AMDGPU::sub3: + case R600::sub3: return AluT_W; default: break; @@ -273,16 +271,16 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { // Is the result already member of a X/Y/Z/W class ? unsigned DestReg = MI->getOperand(0).getReg(); - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) || - regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass)) + if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) || + regBelongsToClass(DestReg, &R600::R600_AddrRegClass)) return AluT_X; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass)) + if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass)) return AluT_Y; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass)) + if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass)) return AluT_Z; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass)) + if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass)) return AluT_W; - if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass)) + if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass)) return AluT_XYZW; // LDS src registers cannot be used in the Trans slot. @@ -303,13 +301,13 @@ int R600SchedStrategy::getInstKind(SUnit* SU) { } switch (Opcode) { - case AMDGPU::PRED_X: - case AMDGPU::COPY: - case AMDGPU::CONST_COPY: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: + case R600::PRED_X: + case R600::COPY: + case R600::CONST_COPY: + case R600::INTERP_PAIR_XY: + case R600::INTERP_PAIR_ZW: + case R600::INTERP_VEC_LOAD: + case R600::DOT_4: return IDAlu; default: return IDOther; @@ -345,17 +343,17 @@ void R600SchedStrategy::LoadAlu() { } void R600SchedStrategy::PrepareNextSlot() { - DEBUG(dbgs() << "New Slot\n"); + LLVM_DEBUG(dbgs() << "New Slot\n"); assert (OccupedSlotsMask && "Slot wasn't filled"); OccupedSlotsMask = 0; -// if (HwGen == R600Subtarget::NORTHERN_ISLANDS) +// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS) // OccupedSlotsMask |= 16; InstructionsGroupCandidate.clear(); LoadAlu(); } void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { - int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst); if (DstIndex == -1) { return; } @@ -372,16 +370,16 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { // Constrains the regclass of DestReg to assign it to Slot switch (Slot) { case 0: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass); + MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass); break; case 1: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass); + MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass); break; case 2: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass); + MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass); break; case 3: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass); + MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass); break; } } @@ -461,7 +459,7 @@ SUnit* R600SchedStrategy::pickOther(int QID) { } if (!AQ.empty()) { SU = AQ.back(); - AQ.resize(AQ.size() - 1); + AQ.pop_back(); } return SU; } diff --git 
a/lib/Target/AMDGPU/R600MachineScheduler.h b/lib/Target/AMDGPU/R600MachineScheduler.h index 9a6770570477..8a9a8d3d1e23 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.h +++ b/lib/Target/AMDGPU/R600MachineScheduler.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief R600 Machine Scheduler interface +/// R600 Machine Scheduler interface // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp index cd71f19760b9..7de5e2c9577d 100644 --- a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp +++ b/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp @@ -1,4 +1,4 @@ -//===- AMDGPUOpenCLImageTypeLoweringPass.cpp ------------------------------===// +//===- R600OpenCLImageTypeLoweringPass.cpp ------------------------------===// // // The LLVM Compiler Infrastructure // @@ -153,7 +153,7 @@ PushArgMD(KernelArgMD &MD, const MDVector &V) { namespace { -class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass { +class R600OpenCLImageTypeLoweringPass : public ModulePass { static char ID; LLVMContext *Context; @@ -364,7 +364,7 @@ class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass { } public: - AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {} + R600OpenCLImageTypeLoweringPass() : ModulePass(ID) {} bool runOnModule(Module &M) override { Context = &M.getContext(); @@ -376,14 +376,14 @@ public: } StringRef getPassName() const override { - return "AMDGPU OpenCL Image Type Pass"; + return "R600 OpenCL Image Type Pass"; } }; } // end anonymous namespace -char AMDGPUOpenCLImageTypeLoweringPass::ID = 0; +char R600OpenCLImageTypeLoweringPass::ID = 0; -ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() { - return new AMDGPUOpenCLImageTypeLoweringPass(); +ModulePass *llvm::createR600OpenCLImageTypeLoweringPass() { + return new R600OpenCLImageTypeLoweringPass(); } diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 4a14d95f1cc4..692451cb8fe0 100644 --- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -31,6 +31,7 @@ #include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" @@ -78,7 +79,7 @@ public: std::vector<unsigned> UndefReg; RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) { - assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE); + assert(MI->getOpcode() == R600::REG_SEQUENCE); for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) { MachineOperand &MO = Instr->getOperand(i); unsigned Chan = Instr->getOperand(i + 1).getImm(); @@ -158,8 +159,8 @@ bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI) if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) return true; switch (MI.getOpcode()) { - case AMDGPU::R600_ExportSwz: - case AMDGPU::EG_ExportSwz: + case R600::R600_ExportSwz: + case R600::EG_ExportSwz: return true; default: return false; @@ -212,12 +213,12 @@ MachineInstr *R600VectorRegMerger::RebuildVector( std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg; for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(), E = RSI->RegToChan.end(); It != E; ++It) { - unsigned DstReg = 
MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass); unsigned SubReg = (*It).first; unsigned Swizzle = (*It).second; unsigned Chan = getReassignedChan(RemapChan, Swizzle); - MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG), + MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(R600::INSERT_SUBREG), DstReg) .addReg(SrcVec) .addReg(SubReg) @@ -228,20 +229,20 @@ MachineInstr *R600VectorRegMerger::RebuildVector( UpdatedUndef.erase(ChanPos); assert(!is_contained(UpdatedUndef, Chan) && "UpdatedUndef shouldn't contain Chan more than once!"); - DEBUG(dbgs() << " ->"; Tmp->dump();); + LLVM_DEBUG(dbgs() << " ->"; Tmp->dump();); (void)Tmp; SrcVec = DstReg; } MachineInstr *NewMI = - BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg).addReg(SrcVec); - DEBUG(dbgs() << " ->"; NewMI->dump();); + BuildMI(MBB, Pos, DL, TII->get(R600::COPY), Reg).addReg(SrcVec); + LLVM_DEBUG(dbgs() << " ->"; NewMI->dump();); - DEBUG(dbgs() << " Updating Swizzle:\n"); + LLVM_DEBUG(dbgs() << " Updating Swizzle:\n"); for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), E = MRI->use_instr_end(); It != E; ++It) { - DEBUG(dbgs() << " ";(*It).dump(); dbgs() << " ->"); + LLVM_DEBUG(dbgs() << " "; (*It).dump(); dbgs() << " ->"); SwizzleInput(*It, RemapChan); - DEBUG((*It).dump()); + LLVM_DEBUG((*It).dump()); } RSI->Instr->eraseFromParent(); @@ -353,7 +354,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end(); MII != MIIE; ++MII) { MachineInstr &MI = *MII; - if (MI.getOpcode() != AMDGPU::REG_SEQUENCE) { + if (MI.getOpcode() != R600::REG_SEQUENCE) { if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { unsigned Reg = MI.getOperand(1).getReg(); for (MachineRegisterInfo::def_instr_iterator @@ -372,14 +373,14 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { if (!areAllUsesSwizzeable(Reg)) continue; - DEBUG({ + LLVM_DEBUG({ dbgs() << "Trying to optimize "; MI.dump(); }); RegSeqInfo CandidateRSI; std::vector<std::pair<unsigned, unsigned>> RemapChan; - DEBUG(dbgs() << "Using common slots...\n";); + LLVM_DEBUG(dbgs() << "Using common slots...\n";); if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) { // Remove CandidateRSI mapping RemoveMI(CandidateRSI.Instr); @@ -387,7 +388,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { trackRSI(RSI); continue; } - DEBUG(dbgs() << "Using free slots...\n";); + LLVM_DEBUG(dbgs() << "Using free slots...\n";); RemapChan.clear(); if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) { RemoveMI(CandidateRSI.Instr); diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp index 7340318d2d88..612c62b514fd 100644 --- a/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/lib/Target/AMDGPU/R600Packetizer.cpp @@ -17,6 +17,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "R600InstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -83,39 +84,39 @@ private: LastDstChan = BISlot; if (TII->isPredicated(*BI)) continue; - int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); + int OperandIdx = TII->getOperandIdx(BI->getOpcode(), R600::OpName::write); if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) continue; - int DstIdx = 
TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst); + int DstIdx = TII->getOperandIdx(BI->getOpcode(), R600::OpName::dst); if (DstIdx == -1) { continue; } unsigned Dst = BI->getOperand(DstIdx).getReg(); if (isTrans || TII->isTransOnly(*BI)) { - Result[Dst] = AMDGPU::PS; + Result[Dst] = R600::PS; continue; } - if (BI->getOpcode() == AMDGPU::DOT4_r600 || - BI->getOpcode() == AMDGPU::DOT4_eg) { - Result[Dst] = AMDGPU::PV_X; + if (BI->getOpcode() == R600::DOT4_r600 || + BI->getOpcode() == R600::DOT4_eg) { + Result[Dst] = R600::PV_X; continue; } - if (Dst == AMDGPU::OQAP) { + if (Dst == R600::OQAP) { continue; } unsigned PVReg = 0; switch (TRI.getHWRegChan(Dst)) { case 0: - PVReg = AMDGPU::PV_X; + PVReg = R600::PV_X; break; case 1: - PVReg = AMDGPU::PV_Y; + PVReg = R600::PV_Y; break; case 2: - PVReg = AMDGPU::PV_Z; + PVReg = R600::PV_Z; break; case 3: - PVReg = AMDGPU::PV_W; + PVReg = R600::PV_W; break; default: llvm_unreachable("Invalid Chan"); @@ -128,9 +129,9 @@ private: void substitutePV(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PVs) const { unsigned Ops[] = { - AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - AMDGPU::OpName::src2 + R600::OpName::src0, + R600::OpName::src1, + R600::OpName::src2 }; for (unsigned i = 0; i < 3; i++) { int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]); @@ -170,7 +171,7 @@ public: return true; if (!TII->isALUInstr(MI.getOpcode())) return true; - if (MI.getOpcode() == AMDGPU::GROUP_BARRIER) + if (MI.getOpcode() == R600::GROUP_BARRIER) return true; // XXX: This can be removed once the packetizer properly handles all the // LDS instruction group restrictions. @@ -184,8 +185,8 @@ public: if (getSlot(*MII) == getSlot(*MIJ)) ConsideredInstUsesAlreadyWrittenVectorElement = true; // Does MII and MIJ share the same pred_sel ? - int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel), - OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel); + int OpI = TII->getOperandIdx(MII->getOpcode(), R600::OpName::pred_sel), + OpJ = TII->getOperandIdx(MIJ->getOpcode(), R600::OpName::pred_sel); unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; if (PredI != PredJ) @@ -219,7 +220,7 @@ public: } void setIsLastBit(MachineInstr *MI, unsigned Bit) const { - unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); + unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), R600::OpName::last); MI->getOperand(LastOp).setImm(Bit); } @@ -236,7 +237,7 @@ public: if (ConsideredInstUsesAlreadyWrittenVectorElement && !TII->isVectorOnly(MI) && VLIW5) { isTransSlot = true; - DEBUG({ + LLVM_DEBUG({ dbgs() << "Considering as Trans Inst :"; MI.dump(); }); @@ -249,7 +250,7 @@ public: // Are the Constants limitations met ? CurrentPacketMIs.push_back(&MI); if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) { - DEBUG({ + LLVM_DEBUG({ dbgs() << "Couldn't pack :\n"; MI.dump(); dbgs() << "with the following packets :\n"; @@ -266,7 +267,7 @@ public: // Is there a BankSwizzle set that meet Read Port limitations ? 
if (!TII->fitsReadPortLimitations(CurrentPacketMIs, PV, BS, isTransSlot)) { - DEBUG({ + LLVM_DEBUG({ dbgs() << "Couldn't pack :\n"; MI.dump(); dbgs() << "with the following packets :\n"; @@ -300,11 +301,11 @@ public: for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) { MachineInstr *MI = CurrentPacketMIs[i]; unsigned Op = TII->getOperandIdx(MI->getOpcode(), - AMDGPU::OpName::bank_swizzle); + R600::OpName::bank_swizzle); MI->getOperand(Op).setImm(BS[i]); } unsigned Op = - TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::bank_swizzle); + TII->getOperandIdx(MI.getOpcode(), R600::OpName::bank_swizzle); MI.getOperand(Op).setImm(BS.back()); if (!CurrentPacketMIs.empty()) setIsLastBit(CurrentPacketMIs.back(), 0); @@ -333,6 +334,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { // DFA state table should not be empty. assert(Packetizer.getResourceTracker() && "Empty DFA table!"); + assert(Packetizer.getResourceTracker()->getInstrItins()); if (Packetizer.getResourceTracker()->getInstrItins()->isEmpty()) return false; @@ -352,8 +354,8 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { MachineBasicBlock::iterator End = MBB->end(); MachineBasicBlock::iterator MI = MBB->begin(); while (MI != End) { - if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF || - (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) { + if (MI->isKill() || MI->getOpcode() == R600::IMPLICIT_DEF || + (MI->getOpcode() == R600::CF_ALU && !MI->getOperand(8).getImm())) { MachineBasicBlock::iterator DeleteMI = MI; ++MI; MBB->erase(DeleteMI); diff --git a/lib/Target/AMDGPU/R600Processors.td b/lib/Target/AMDGPU/R600Processors.td index 89194dc1bdf6..f39b3dc1bfd4 100644 --- a/lib/Target/AMDGPU/R600Processors.td +++ b/lib/Target/AMDGPU/R600Processors.td @@ -7,6 +7,62 @@ // //===----------------------------------------------------------------------===// +class SubtargetFeatureFetchLimit <string Value> : + SubtargetFeature <"fetch"#Value, + "TexVTXClauseSize", + Value, + "Limit the maximum number of fetches in a clause to "#Value +>; + +def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", + "R600ALUInst", + "false", + "Older version of ALU instructions encoding" +>; + +def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; +def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; + +def FeatureVertexCache : SubtargetFeature<"HasVertexCache", + "HasVertexCache", + "true", + "Specify use of dedicated vertex cache" +>; + +def FeatureCaymanISA : SubtargetFeature<"caymanISA", + "CaymanISA", + "true", + "Use Cayman ISA" +>; + +def FeatureCFALUBug : SubtargetFeature<"cfalubug", + "CFALUBug", + "true", + "GPU has CF_ALU bug" +>; + +class R600SubtargetFeatureGeneration <string Value, + list<SubtargetFeature> Implies> : + SubtargetFeatureGeneration <Value, "R600Subtarget", Implies>; + +def FeatureR600 : R600SubtargetFeatureGeneration<"R600", + [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0] +>; + +def FeatureR700 : R600SubtargetFeatureGeneration<"R700", + [FeatureFetchLimit16, FeatureLocalMemorySize0] +>; + +def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN", + [FeatureFetchLimit16, FeatureLocalMemorySize32768] +>; + +def FeatureNorthernIslands : R600SubtargetFeatureGeneration<"NORTHERN_ISLANDS", + [FeatureFetchLimit16, FeatureWavefrontSize64, + FeatureLocalMemorySize32768] +>; + + //===----------------------------------------------------------------------===// // Radeon HD 2000/3000 Series (R600). 
//===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp index 7501facb0cba..38933e7616a0 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief R600 implementation of the TargetRegisterInfo class. +/// R600 implementation of the TargetRegisterInfo class. // //===----------------------------------------------------------------------===// @@ -17,47 +17,51 @@ #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" using namespace llvm; -R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() { +R600RegisterInfo::R600RegisterInfo() : R600GenRegisterInfo(0) { RCW.RegWeight = 0; RCW.WeightLimit = 0; } +#define GET_REGINFO_TARGET_DESC +#include "R600GenRegisterInfo.inc" + BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); const R600InstrInfo *TII = ST.getInstrInfo(); - Reserved.set(AMDGPU::ZERO); - Reserved.set(AMDGPU::HALF); - Reserved.set(AMDGPU::ONE); - Reserved.set(AMDGPU::ONE_INT); - Reserved.set(AMDGPU::NEG_HALF); - Reserved.set(AMDGPU::NEG_ONE); - Reserved.set(AMDGPU::PV_X); - Reserved.set(AMDGPU::ALU_LITERAL_X); - Reserved.set(AMDGPU::ALU_CONST); - Reserved.set(AMDGPU::PREDICATE_BIT); - Reserved.set(AMDGPU::PRED_SEL_OFF); - Reserved.set(AMDGPU::PRED_SEL_ZERO); - Reserved.set(AMDGPU::PRED_SEL_ONE); - Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); - - for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(), - E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) { - Reserved.set(*I); + reserveRegisterTuples(Reserved, R600::ZERO); + reserveRegisterTuples(Reserved, R600::HALF); + reserveRegisterTuples(Reserved, R600::ONE); + reserveRegisterTuples(Reserved, R600::ONE_INT); + reserveRegisterTuples(Reserved, R600::NEG_HALF); + reserveRegisterTuples(Reserved, R600::NEG_ONE); + reserveRegisterTuples(Reserved, R600::PV_X); + reserveRegisterTuples(Reserved, R600::ALU_LITERAL_X); + reserveRegisterTuples(Reserved, R600::ALU_CONST); + reserveRegisterTuples(Reserved, R600::PREDICATE_BIT); + reserveRegisterTuples(Reserved, R600::PRED_SEL_OFF); + reserveRegisterTuples(Reserved, R600::PRED_SEL_ZERO); + reserveRegisterTuples(Reserved, R600::PRED_SEL_ONE); + reserveRegisterTuples(Reserved, R600::INDIRECT_BASE_ADDR); + + for (TargetRegisterClass::iterator I = R600::R600_AddrRegClass.begin(), + E = R600::R600_AddrRegClass.end(); I != E; ++I) { + reserveRegisterTuples(Reserved, *I); } - TII->reserveIndirectRegisters(Reserved, MF); + TII->reserveIndirectRegisters(Reserved, MF, *this); return Reserved; } // Dummy to not crash RegisterClassInfo. 
-static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister; +static const MCPhysReg CalleeSavedReg = R600::NoRegister; const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs( const MachineFunction *) const { @@ -65,7 +69,7 @@ const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs( } unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - return AMDGPU::NoRegister; + return R600::NoRegister; } unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { @@ -80,7 +84,7 @@ const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass( MVT VT) const { switch(VT.SimpleTy) { default: - case MVT::i32: return &AMDGPU::R600_TReg32RegClass; + case MVT::i32: return &R600::R600_TReg32RegClass; } } @@ -93,9 +97,9 @@ bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const { assert(!TargetRegisterInfo::isVirtualRegister(Reg)); switch (Reg) { - case AMDGPU::OQAP: - case AMDGPU::OQBP: - case AMDGPU::AR_X: + case R600::OQAP: + case R600::OQBP: + case R600::AR_X: return false; default: return true; @@ -108,3 +112,10 @@ void R600RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, RegScavenger *RS) const { llvm_unreachable("Subroutines not supported yet"); } + +void R600RegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { + MCRegAliasIterator R(Reg, this, true); + + for (; R.isValid(); ++R) + Reserved.set(*R); +} diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h index f0d9644b02f2..c4c77172b299 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.h +++ b/lib/Target/AMDGPU/R600RegisterInfo.h @@ -8,20 +8,19 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Interface definition for R600RegisterInfo +/// Interface definition for R600RegisterInfo // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H #define LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H -#include "AMDGPURegisterInfo.h" +#define GET_REGINFO_HEADER +#include "R600GenRegisterInfo.inc" namespace llvm { -class AMDGPUSubtarget; - -struct R600RegisterInfo final : public AMDGPURegisterInfo { +struct R600RegisterInfo final : public R600GenRegisterInfo { RegClassWeight RCW; R600RegisterInfo(); @@ -30,12 +29,12 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo { const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; unsigned getFrameRegister(const MachineFunction &MF) const override; - /// \brief get the HW encoding for a register's channel. + /// get the HW encoding for a register's channel. 
unsigned getHWRegChan(unsigned reg) const; unsigned getHWRegIndex(unsigned Reg) const; - /// \brief get the register class of the specified type to use in the + /// get the register class of the specified type to use in the /// CFGStructurizer const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const; @@ -49,6 +48,8 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo { void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS = nullptr) const override; + + void reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td index 84ab328bdb2b..02164b74a01b 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.td +++ b/lib/Target/AMDGPU/R600RegisterInfo.td @@ -245,7 +245,7 @@ def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add V0123_W, V0123_Z, V0123_Y, V0123_X) >; -def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, +def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32, i64, f64], 64, (add (sequence "T%u_XY", 0, 63))>; def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 150d8c3dc3d3..74f1bd8fb986 100644 --- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" @@ -37,7 +38,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <utility> @@ -133,7 +133,7 @@ INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE, char SIAnnotateControlFlow::ID = 0; -/// \brief Initialize all the types and constants used in the pass +/// Initialize all the types and constants used in the pass bool SIAnnotateControlFlow::doInitialization(Module &M) { LLVMContext &Context = M.getContext(); @@ -157,29 +157,29 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { return false; } -/// \brief Is the branch condition uniform or did the StructurizeCFG pass +/// Is the branch condition uniform or did the StructurizeCFG pass /// consider it as such? bool SIAnnotateControlFlow::isUniform(BranchInst *T) { return DA->isUniform(T->getCondition()) || T->getMetadata("structurizecfg.uniform") != nullptr; } -/// \brief Is BB the last block saved on the stack ? +/// Is BB the last block saved on the stack ? bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) { return !Stack.empty() && Stack.back().first == BB; } -/// \brief Pop the last saved value from the control flow stack +/// Pop the last saved value from the control flow stack Value *SIAnnotateControlFlow::popSaved() { return Stack.pop_back_val().second; } -/// \brief Push a BB and saved value to the control flow stack +/// Push a BB and saved value to the control flow stack void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) { Stack.push_back(std::make_pair(BB, Saved)); } -/// \brief Can the condition represented by this PHI node treated like +/// Can the condition represented by this PHI node treated like /// an "Else" block? 
bool SIAnnotateControlFlow::isElse(PHINode *Phi) { BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock(); @@ -198,14 +198,14 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) { return true; } -// \brief Erase "Phi" if it is not used any more +// Erase "Phi" if it is not used any more void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { if (RecursivelyDeleteDeadPHINode(Phi)) { - DEBUG(dbgs() << "Erased unused condition phi\n"); + LLVM_DEBUG(dbgs() << "Erased unused condition phi\n"); } } -/// \brief Open a new "If" block +/// Open a new "If" block void SIAnnotateControlFlow::openIf(BranchInst *Term) { if (isUniform(Term)) return; @@ -215,7 +215,7 @@ void SIAnnotateControlFlow::openIf(BranchInst *Term) { push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); } -/// \brief Close the last "If" block and open a new "Else" block +/// Close the last "If" block and open a new "Else" block void SIAnnotateControlFlow::insertElse(BranchInst *Term) { if (isUniform(Term)) { return; @@ -225,7 +225,7 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) { push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); } -/// \brief Recursively handle the condition leading to a loop +/// Recursively handle the condition leading to a loop Value *SIAnnotateControlFlow::handleLoopCondition( Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term, SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) { @@ -322,7 +322,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition( llvm_unreachable("Unhandled loop condition!"); } -/// \brief Handle a back edge (loop) +/// Handle a back edge (loop) void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { if (isUniform(Term)) return; @@ -353,7 +353,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { push(Term->getSuccessor(0), Arg); } -/// \brief Close the last opened control flow +/// Close the last opened control flow void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { llvm::Loop *L = LI->getLoopFor(BB); @@ -381,7 +381,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { CallInst::Create(EndCf, Exec, "", FirstInsertionPt); } -/// \brief Annotate the control flow with intrinsics so the backend can +/// Annotate the control flow with intrinsics so the backend can /// recognize if/then/else and loops. bool SIAnnotateControlFlow::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -422,11 +422,15 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { openIf(Term); } - assert(Stack.empty()); + if (!Stack.empty()) { + // CFG was probably not structured. + report_fatal_error("failed to annotate CFG"); + } + return true; } -/// \brief Create the annotation pass +/// Create the annotation pass FunctionPass *llvm::createSIAnnotateControlFlowPass() { return new SIAnnotateControlFlow(); } diff --git a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp index b5c439b21b89..7e884ad93a23 100644 --- a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp +++ b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Inserts one nop instruction for each high level source statement for +/// Inserts one nop instruction for each high level source statement for /// debugger usage. /// /// Tools, such as a debugger, need to pause execution based on user input (i.e. 
@@ -21,6 +21,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DenseSet.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -62,7 +63,7 @@ FunctionPass *llvm::createSIDebuggerInsertNopsPass() { bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) { // Skip this pass if "amdgpu-debugger-insert-nops" attribute was not // specified. - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (!ST.debuggerInsertNops()) return false; @@ -78,8 +79,8 @@ bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) { for (auto &MBB : MF) { for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { - // Skip DBG_VALUE instructions and instructions without location. - if (MI->isDebugValue() || !MI->getDebugLoc()) + // Skip debug instructions and instructions without location. + if (MI->isDebugInstr() || !MI->getDebugLoc()) continue; // Insert nop instruction if line number does not have nop inserted. diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index a9f6069e798a..a6d28d6999e5 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -85,7 +85,10 @@ enum : uint64_t { ClampHi = UINT64_C(1) << 48, // Is a packed VOP3P instruction. - IsPacked = UINT64_C(1) << 49 + IsPacked = UINT64_C(1) << 49, + + // Is a D16 buffer instruction. + D16Buf = UINT64_C(1) << 50 }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. @@ -137,7 +140,6 @@ namespace AMDGPU { OPERAND_INPUT_MODS, // Operand for SDWA instructions - OPERAND_SDWA_SRC, OPERAND_SDWA_VOPC_DST, /// Operand with 32-bit immediate that uses the constant bus. @@ -146,6 +148,13 @@ namespace AMDGPU { }; } +namespace SIStackID { +enum StackTypes : uint8_t { + SCRATCH = 0, + SGPR_SPILL = 1 +}; +} + // Input operand modifiers bit-masks // NEG and SEXT share same bit-mask because they can't be set simultaneously. 
namespace SISrcMods { @@ -273,8 +282,9 @@ enum Id { // HwRegCode, (6) [5:0] ID_GPR_ALLOC = 5, ID_LDS_ALLOC = 6, ID_IB_STS = 7, - ID_SYMBOLIC_LAST_ = 8, ID_MEM_BASES = 15, + ID_SYMBOLIC_FIRST_GFX9_ = ID_MEM_BASES, + ID_SYMBOLIC_LAST_ = 16, ID_SHIFT_ = 0, ID_WIDTH_ = 6, ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) @@ -375,6 +385,44 @@ enum SDWA9EncValues{ }; } // namespace SDWA + +namespace DPP { + +enum DppCtrl { + QUAD_PERM_FIRST = 0, + QUAD_PERM_LAST = 0xFF, + DPP_UNUSED1 = 0x100, + ROW_SHL0 = 0x100, + ROW_SHL_FIRST = 0x101, + ROW_SHL_LAST = 0x10F, + DPP_UNUSED2 = 0x110, + ROW_SHR0 = 0x110, + ROW_SHR_FIRST = 0x111, + ROW_SHR_LAST = 0x11F, + DPP_UNUSED3 = 0x120, + ROW_ROR0 = 0x120, + ROW_ROR_FIRST = 0x121, + ROW_ROR_LAST = 0x12F, + WAVE_SHL1 = 0x130, + DPP_UNUSED4_FIRST = 0x131, + DPP_UNUSED4_LAST = 0x133, + WAVE_ROL1 = 0x134, + DPP_UNUSED5_FIRST = 0x135, + DPP_UNUSED5_LAST = 0x137, + WAVE_SHR1 = 0x138, + DPP_UNUSED6_FIRST = 0x139, + DPP_UNUSED6_LAST = 0x13B, + WAVE_ROR1 = 0x13C, + DPP_UNUSED7_FIRST = 0x13D, + DPP_UNUSED7_LAST = 0x13F, + ROW_MIRROR = 0x140, + ROW_HALF_MIRROR = 0x141, + BCAST15 = 0x142, + BCAST31 = 0x143, + DPP_LAST = BCAST31 +}; + +} // namespace DPP } // namespace AMDGPU #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 8b155c2d2780..566e0d3febc7 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -69,6 +69,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" @@ -81,7 +82,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" @@ -110,12 +110,7 @@ namespace { class SIFixSGPRCopies : public MachineFunctionPass { MachineDominatorTree *MDT; - MachinePostDominatorTree *MPDT; - DenseMap<MachineBasicBlock *, SetVector<MachineBasicBlock*>> PDF; - void computePDF(MachineFunction * MF); -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void printPDF(); -#endif + public: static char ID; @@ -128,8 +123,6 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); - AU.addRequired<MachinePostDominatorTree>(); - AU.addPreserved<MachinePostDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -417,6 +410,12 @@ bool searchPredecessors(const MachineBasicBlock *MBB, return false; } +static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, + const TargetRegisterInfo *TRI) { + return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) { + return hasTerminatorThatModifiesExec(*MBB, *TRI); }); +} + // Checks if there is potential path From instruction To instruction. // If CutOff is specified and it sits in between of that path we ignore // a higher portion of the path and report it is not reachable. 
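An aside on the DppCtrl encodings added to SIDefines.h above: the QUAD_PERM_FIRST..QUAD_PERM_LAST range (0x00-0xFF) packs four 2-bit lane selectors, lane 0 in the lowest bits. A minimal, self-contained sketch of that packing (the helper is illustrative, not part of the patch):

// Assemble a quad_perm dpp_ctrl immediate: two bits per lane, lane 0 lowest.
// encodeQuadPerm(1, 0, 3, 2) == 0xB1 swaps adjacent lanes within each quad.
constexpr unsigned encodeQuadPerm(unsigned S0, unsigned S1,
                                  unsigned S2, unsigned S3) {
  return (S0 & 3) | ((S1 & 3) << 2) | ((S2 & 3) << 4) | ((S3 & 3) << 6);
}
static_assert(encodeQuadPerm(0, 1, 2, 3) == 0xE4, "identity permutation");
static_assert(encodeQuadPerm(3, 3, 3, 3) == 0xFF, "matches QUAD_PERM_LAST");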
@@ -515,9 +514,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, if (MDT.dominates(MI1, MI2)) { if (!intereferes(MI2, MI1)) { - DEBUG(dbgs() << "Erasing from " - << printMBBReference(*MI2->getParent()) << " " - << *MI2); + LLVM_DEBUG(dbgs() + << "Erasing from " + << printMBBReference(*MI2->getParent()) << " " << *MI2); MI2->eraseFromParent(); Defs.erase(I2++); Changed = true; @@ -525,9 +524,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, } } else if (MDT.dominates(MI2, MI1)) { if (!intereferes(MI1, MI2)) { - DEBUG(dbgs() << "Erasing from " - << printMBBReference(*MI1->getParent()) << " " - << *MI1); + LLVM_DEBUG(dbgs() + << "Erasing from " + << printMBBReference(*MI1->getParent()) << " " << *MI1); MI1->eraseFromParent(); Defs.erase(I1++); Changed = true; @@ -543,11 +542,12 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); if (!intereferes(MI1, I) && !intereferes(MI2, I)) { - DEBUG(dbgs() << "Erasing from " - << printMBBReference(*MI1->getParent()) << " " << *MI1 - << "and moving from " - << printMBBReference(*MI2->getParent()) << " to " - << printMBBReference(*I->getParent()) << " " << *MI2); + LLVM_DEBUG(dbgs() + << "Erasing from " + << printMBBReference(*MI1->getParent()) << " " << *MI1 + << "and moving from " + << printMBBReference(*MI2->getParent()) << " to " + << printMBBReference(*I->getParent()) << " " << *MI2); I->getParent()->splice(I, MI2->getParent(), MI2); MI1->eraseFromParent(); Defs.erase(I1++); @@ -567,47 +567,12 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, return Changed; } -void SIFixSGPRCopies::computePDF(MachineFunction *MF) { - MachineFunction::iterator B = MF->begin(); - MachineFunction::iterator E = MF->end(); - for (; B != E; ++B) { - if (B->succ_size() > 1) { - for (auto S : B->successors()) { - MachineDomTreeNode *runner = MPDT->getNode(&*S); - MachineDomTreeNode *sentinel = MPDT->getNode(&*B)->getIDom(); - while (runner && runner != sentinel) { - PDF[runner->getBlock()].insert(&*B); - runner = runner->getIDom(); - } - } - } - } -} - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -void SIFixSGPRCopies::printPDF() { - dbgs() << "\n######## PostDominanceFrontiers set #########\n"; - for (auto &I : PDF) { - dbgs() << "PDF[ " << I.first->getNumber() << "] : "; - for (auto &J : I.second) { - dbgs() << J->getNumber() << ' '; - } - dbgs() << '\n'; - } - dbgs() << "\n##############################################\n"; -} -#endif - bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); MachineRegisterInfo &MRI = MF.getRegInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); MDT = &getAnalysis<MachineDominatorTree>(); - MPDT = &getAnalysis<MachinePostDominatorTree>(); - PDF.clear(); - computePDF(&MF); - DEBUG(printPDF()); SmallVector<MachineInstr *, 16> Worklist; @@ -661,28 +626,17 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) break; - // We don't need to fix the PHI if all the source blocks - // have no divergent control dependecies + // We don't need to fix the PHI if the common dominator of the + // two incoming blocks terminates with a uniform branch. 
bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII); - if (!HasVGPROperand) { - bool Uniform = true; - MachineBasicBlock * Join = MI.getParent(); - for (auto &O : MI.explicit_operands()) { - if (O.isMBB()) { - MachineBasicBlock * Source = O.getMBB(); - SetVector<MachineBasicBlock*> &SourcePDF = PDF[Source]; - SetVector<MachineBasicBlock*> &JoinPDF = PDF[Join]; - SetVector<MachineBasicBlock*> CDList; - for (auto &I : SourcePDF) { - if (!JoinPDF.count(I) || /* back edge */MDT->dominates(Join, I)) { - if (hasTerminatorThatModifiesExec(*I, *TRI)) - Uniform = false; - } - } - } - } - if (Uniform) { - DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n'); + if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) { + MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB(); + MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB(); + + if (!predsHasDivergentTerminator(MBB0, TRI) && + !predsHasDivergentTerminator(MBB1, TRI)) { + LLVM_DEBUG(dbgs() + << "Not fixing PHI for uniform branch: " << MI << '\n'); break; } } @@ -722,7 +676,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { SmallSet<unsigned, 8> Visited; if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) { - DEBUG(dbgs() << "Fixing PHI: " << MI); + LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI); TII->moveToVALU(MI); } break; @@ -734,7 +688,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { continue; } - DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); + LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); TII->moveToVALU(MI); break; @@ -745,7 +699,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { Src1RC = MRI.getRegClass(MI.getOperand(2).getReg()); if (TRI->isSGPRClass(DstRC) && (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { - DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); + LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); TII->moveToVALU(MI); } break; diff --git a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp index 3d3121788b5e..15ba78edf919 100644 --- a/lib/Target/AMDGPU/SIFixVGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixVGPRCopies.cpp @@ -8,13 +8,14 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Add implicit use of exec to vector register copies. +/// Add implicit use of exec to vector register copies. 
/// //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineFunctionPass.h" using namespace llvm; @@ -46,7 +47,7 @@ char SIFixVGPRCopies::ID = 0; char &llvm::SIFixVGPRCopiesID = SIFixVGPRCopies::ID; bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); bool Changed = false; @@ -58,7 +59,7 @@ bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) { if (TII->isVGPRCopy(MI) && !MI.readsRegister(AMDGPU::EXEC, TRI)) { MI.addOperand(MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); - DEBUG(dbgs() << "Add exec use to " << MI); + LLVM_DEBUG(dbgs() << "Add exec use to " << MI); Changed = true; } break; diff --git a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp index 3493c7775f0c..5d613d8874fa 100644 --- a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp +++ b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Computations in WWM can overwrite values in inactive channels for +/// Computations in WWM can overwrite values in inactive channels for /// variables that the register allocator thinks are dead. This pass adds fake /// uses of those variables to WWM instructions to make sure that they aren't /// overwritten. @@ -55,6 +55,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -184,7 +185,7 @@ bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) { // This doesn't actually need LiveIntervals, but we can preserve them. LIS = getAnalysisIfAvailable<LiveIntervals>(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 783181980342..338cabcb906b 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -13,6 +13,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -75,7 +76,7 @@ public: MachineRegisterInfo *MRI; const SIInstrInfo *TII; const SIRegisterInfo *TRI; - const SISubtarget *ST; + const GCNSubtarget *ST; void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, @@ -127,14 +128,18 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII, unsigned Opc = UseMI.getOpcode(); switch (Opc) { case AMDGPU::V_MAC_F32_e64: - case AMDGPU::V_MAC_F16_e64: { + case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_FMAC_F32_e64: { // Special case for mac. Since this is replaced with mad when folded into // src2, we need to check the legality for the final instruction. 
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); if (static_cast<int>(OpNo) == Src2Idx) { + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64; bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; - const MCInstrDesc &MadDesc - = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + + unsigned Opc = IsFMA ? + AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); + const MCInstrDesc &MadDesc = TII->get(Opc); return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); } return false; @@ -155,6 +160,35 @@ static bool updateOperand(FoldCandidate &Fold, assert(Old.isReg()); if (Fold.isImm()) { + if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) { + // Set op_sel/op_sel_hi on this operand or bail out if op_sel is + // already set. + unsigned Opcode = MI->getOpcode(); + int OpNo = MI->getOperandNo(&Old); + int ModIdx = -1; + if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) + ModIdx = AMDGPU::OpName::src0_modifiers; + else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) + ModIdx = AMDGPU::OpName::src1_modifiers; + else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) + ModIdx = AMDGPU::OpName::src2_modifiers; + assert(ModIdx != -1); + ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx); + MachineOperand &Mod = MI->getOperand(ModIdx); + unsigned Val = Mod.getImm(); + if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1)) + return false; + // If upper part is all zero we do not need op_sel_hi. + if (!isUInt<16>(Fold.ImmToFold)) { + if (!(Fold.ImmToFold & 0xffff)) { + Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); + Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); + Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + return true; + } + Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); + } + } Old.ChangeToImmediate(Fold.ImmToFold); return true; } @@ -195,13 +229,17 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2 unsigned Opc = MI->getOpcode(); - if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64) && + if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F32_e64) && (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) { + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64; bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64; + unsigned NewOpc = IsFMA ? + AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); // Check if changing this to a v_mad_{f16, f32} instruction will allow us // to fold the operand. - MI->setDesc(TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16)); + MI->setDesc(TII->get(NewOpc)); bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII); if (FoldAsMAD) { MI->untieRegOperand(OpNo); @@ -345,6 +383,7 @@ void SIFoldOperands::foldOperand( // Don't fold into target independent nodes. Target independent opcodes // don't have defined register classes. if (UseDesc.isVariadic() || + UseOp.isImplicit() || UseDesc.OpInfo[UseOpIdx].RegClass == -1) return; } @@ -470,7 +509,8 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI, MachineOperand &Op) { if (Op.isReg()) { // If this has a subregister, it obviously is a register source. 
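The packed-immediate fold added to updateOperand above is easier to follow with concrete numbers. Below is a standalone sketch of the same decision; the helper and the OP_SEL_0/OP_SEL_1 constants are stand-ins assumed for illustration, not LLVM API:

#include <cassert>
#include <cstdint>

constexpr uint32_t OP_SEL_0 = 1u << 2;  // stand-in for SISrcMods::OP_SEL_0
constexpr uint32_t OP_SEL_1 = 1u << 3;  // stand-in for SISrcMods::OP_SEL_1

// Returns false if op_sel is already customised; otherwise rewrites Mods/Imm
// the same way the fold above does for a 32-bit literal on a packed operand.
bool foldPackedImm(uint32_t &Mods, uint32_t &Imm) {
  if ((Mods & OP_SEL_0) || !(Mods & OP_SEL_1))
    return false;
  if (Imm > 0xFFFFu) {                  // literal does not fit in 16 bits
    if ((Imm & 0xFFFFu) == 0) {         // only the high half carries data:
      Mods = (Mods | OP_SEL_0) & ~OP_SEL_1;
      Imm = (Imm >> 16) & 0xFFFFu;      // shift it down, select it via op_sel
      return true;
    }
    Mods &= ~OP_SEL_1;                  // keep the full literal, clear op_sel_hi
  }
  return true;                          // 16-bit literals fold with Mods untouched
}

int main() {
  uint32_t Mods = OP_SEL_1, Imm = 0x3C000000;  // high half 0x3C00, low half 0
  assert(foldPackedImm(Mods, Imm) && Imm == 0x3C00 && (Mods & OP_SEL_0));
  return 0;
}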
- if (Op.getSubReg() != AMDGPU::NoSubRegister) + if (Op.getSubReg() != AMDGPU::NoSubRegister || + !TargetRegisterInfo::isVirtualRegister(Op.getReg())) return &Op; MachineInstr *Def = MRI.getVRegDef(Op.getReg()); @@ -598,14 +638,14 @@ static bool tryFoldInst(const SIInstrInfo *TII, const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0); const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1); if (Src1->isIdenticalTo(*Src0)) { - DEBUG(dbgs() << "Folded " << *MI << " into "); + LLVM_DEBUG(dbgs() << "Folded " << *MI << " into "); int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); if (Src2Idx != -1) MI->RemoveOperand(Src2Idx); MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1)); mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false))); - DEBUG(dbgs() << *MI << '\n'); + LLVM_DEBUG(dbgs() << *MI << '\n'); return true; } } @@ -646,7 +686,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // be folded due to multiple uses or operand constraints. if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) { - DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n'); + LLVM_DEBUG(dbgs() << "Constant folded " << *UseMI << '\n'); // Some constant folding cases change the same immediate's use to a new // instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user @@ -713,8 +753,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // copies. MRI->clearKillFlags(Fold.OpToFold->getReg()); } - DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << - static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n'); + LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " + << static_cast<int>(Fold.UseOpNo) << " of " + << *Fold.UseMI << '\n'); tryFoldInst(TII, Fold.UseMI); } else if (Fold.isCommuted()) { // Restoring instruction's original operand order if fold has failed. @@ -794,7 +835,8 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) { if (!DefClamp) return false; - DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n'); + LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def + << '\n'); // Clamp is applied after omod, so it is OK if omod is set. DefClamp->setImm(1); @@ -917,7 +959,7 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) { if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp)) return false; - DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n'); + LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n'); DefOMod->setImm(OMod); MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg()); @@ -930,7 +972,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { return false; MRI = &MF.getRegInfo(); - ST = &MF.getSubtarget<SISubtarget>(); + ST = &MF.getSubtarget<GCNSubtarget>(); TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp new file mode 100644 index 000000000000..cd14239de822 --- /dev/null +++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -0,0 +1,398 @@ +//===-- SIFormMemoryClauses.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass creates bundles of SMEM and VMEM instructions forming memory +/// clauses if XNACK is enabled. Def operands of clauses are marked as early +/// clobber to make sure we will not override any source within a clause. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "GCNRegPressure.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-form-memory-clauses" + +// Clauses longer then 15 instructions would overflow one of the counters +// and stall. They can stall even earlier if there are outstanding counters. +static cl::opt<unsigned> +MaxClause("amdgpu-max-memory-clause", cl::Hidden, cl::init(15), + cl::desc("Maximum length of a memory clause, instructions")); + +namespace { + +class SIFormMemoryClauses : public MachineFunctionPass { + typedef DenseMap<unsigned, std::pair<unsigned, LaneBitmask>> RegUse; + +public: + static char ID; + +public: + SIFormMemoryClauses() : MachineFunctionPass(ID) { + initializeSIFormMemoryClausesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SI Form memory clauses"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + template <typename Callable> + void forAllLanes(unsigned Reg, LaneBitmask LaneMask, Callable Func) const; + + bool canBundle(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const; + bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT); + void collectRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses) const; + bool processRegUses(const MachineInstr &MI, RegUse &Defs, RegUse &Uses, + GCNDownwardRPTracker &RPT); + + const GCNSubtarget *ST; + const SIRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + SIMachineFunctionInfo *MFI; + + unsigned LastRecordedOccupancy; + unsigned MaxVGPRs; + unsigned MaxSGPRs; +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(SIFormMemoryClauses, DEBUG_TYPE, + "SI Form memory clauses", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(SIFormMemoryClauses, DEBUG_TYPE, + "SI Form memory clauses", false, false) + + +char SIFormMemoryClauses::ID = 0; + +char &llvm::SIFormMemoryClausesID = SIFormMemoryClauses::ID; + +FunctionPass *llvm::createSIFormMemoryClausesPass() { + return new SIFormMemoryClauses(); +} + +static bool isVMEMClauseInst(const MachineInstr &MI) { + return SIInstrInfo::isFLAT(MI) || SIInstrInfo::isVMEM(MI); +} + +static bool isSMEMClauseInst(const MachineInstr &MI) { + return SIInstrInfo::isSMRD(MI); +} + +// There no sense to create store clauses, they do not define anything, +// thus there is nothing to set early-clobber. 
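To make the effect of the new pass concrete before its helper predicates below, here is a schematic before/after written as comments; the instruction names, registers and offsets are illustrative only, not output of this patch:

// Before: two adjacent scalar loads from the same pointer, XNACK enabled.
//   %0 = S_LOAD_DWORDX2_IMM %ptr, 0, 0
//   %1 = S_LOAD_DWORDX2_IMM %ptr, 8, 0
//
// After SIFormMemoryClauses: both loads sit in one BUNDLE and their defs are
// early-clobber, so the register allocator cannot reuse %ptr (or any other
// source of the clause) for a result written inside the clause.
//   BUNDLE early-clobber %0, early-clobber %1, %ptr {
//     %0 = S_LOAD_DWORDX2_IMM %ptr, 0, 0
//     %1 = S_LOAD_DWORDX2_IMM %ptr, 8, 0
//   }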
+static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) { + if (MI.isDebugValue() || MI.isBundled()) + return false; + if (!MI.mayLoad() || MI.mayStore()) + return false; + if (AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1 || + AMDGPU::getAtomicRetOp(MI.getOpcode()) != -1) + return false; + if (IsVMEMClause && !isVMEMClauseInst(MI)) + return false; + if (!IsVMEMClause && !isSMEMClauseInst(MI)) + return false; + return true; +} + +static unsigned getMopState(const MachineOperand &MO) { + unsigned S = 0; + if (MO.isImplicit()) + S |= RegState::Implicit; + if (MO.isDead()) + S |= RegState::Dead; + if (MO.isUndef()) + S |= RegState::Undef; + if (MO.isKill()) + S |= RegState::Kill; + if (MO.isEarlyClobber()) + S |= RegState::EarlyClobber; + if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) && MO.isRenamable()) + S |= RegState::Renamable; + return S; +} + +template <typename Callable> +void SIFormMemoryClauses::forAllLanes(unsigned Reg, LaneBitmask LaneMask, + Callable Func) const { + if (LaneMask.all() || TargetRegisterInfo::isPhysicalRegister(Reg) || + LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) { + Func(0); + return; + } + + const TargetRegisterClass *RC = MRI->getRegClass(Reg); + unsigned E = TRI->getNumSubRegIndices(); + SmallVector<unsigned, AMDGPU::NUM_TARGET_SUBREGS> CoveringSubregs; + for (unsigned Idx = 1; Idx < E; ++Idx) { + // Is this index even compatible with the given class? + if (TRI->getSubClassWithSubReg(RC, Idx) != RC) + continue; + LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); + // Early exit if we found a perfect match. + if (SubRegMask == LaneMask) { + Func(Idx); + return; + } + + if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none()) + continue; + + CoveringSubregs.push_back(Idx); + } + + llvm::sort(CoveringSubregs.begin(), CoveringSubregs.end(), + [this](unsigned A, unsigned B) { + LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A); + LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B); + unsigned NA = MaskA.getNumLanes(); + unsigned NB = MaskB.getNumLanes(); + if (NA != NB) + return NA > NB; + return MaskA.getHighestLane() > MaskB.getHighestLane(); + }); + + for (unsigned Idx : CoveringSubregs) { + LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); + if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none()) + continue; + + Func(Idx); + LaneMask &= ~SubRegMask; + if (LaneMask.none()) + return; + } + + llvm_unreachable("Failed to find all subregs to cover lane mask"); +} + +// Returns false if there is a use of a def already in the map. +// In this case we must break the clause. +bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, + RegUse &Defs, RegUse &Uses) const { + // Check interference with defs. + for (const MachineOperand &MO : MI.operands()) { + // TODO: Prologue/Epilogue Insertion pass does not process bundled + // instructions. + if (MO.isFI()) + return false; + + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg(); + + // If it is tied we will need to write same register as we read. + if (MO.isTied()) + return false; + + RegUse &Map = MO.isDef() ? Uses : Defs; + auto Conflict = Map.find(Reg); + if (Conflict == Map.end()) + continue; + + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return false; + + LaneBitmask Mask = TRI->getSubRegIndexLaneMask(MO.getSubReg()); + if ((Conflict->second.second & Mask).any()) + return false; + } + + return true; +} + +// Since all defs in the clause are early clobber we can run out of registers. 
+// Function returns false if pressure would hit the limit if instruction is +// bundled into a memory clause. +bool SIFormMemoryClauses::checkPressure(const MachineInstr &MI, + GCNDownwardRPTracker &RPT) { + // NB: skip advanceBeforeNext() call. Since all defs will be marked + // early-clobber they will all stay alive at least to the end of the + // clause. Therefor we should not decrease pressure even if load + // pointer becomes dead and could otherwise be reused for destination. + RPT.advanceToNext(); + GCNRegPressure MaxPressure = RPT.moveMaxPressure(); + unsigned Occupancy = MaxPressure.getOccupancy(*ST); + if (Occupancy >= MFI->getMinAllowedOccupancy() && + MaxPressure.getVGPRNum() <= MaxVGPRs && + MaxPressure.getSGPRNum() <= MaxSGPRs) { + LastRecordedOccupancy = Occupancy; + return true; + } + return false; +} + +// Collect register defs and uses along with their lane masks and states. +void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI, + RegUse &Defs, RegUse &Uses) const { + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + + LaneBitmask Mask = TargetRegisterInfo::isVirtualRegister(Reg) ? + TRI->getSubRegIndexLaneMask(MO.getSubReg()) : + LaneBitmask::getAll(); + RegUse &Map = MO.isDef() ? Defs : Uses; + + auto Loc = Map.find(Reg); + unsigned State = getMopState(MO); + if (Loc == Map.end()) { + Map[Reg] = std::make_pair(State, Mask); + } else { + Loc->second.first |= State; + Loc->second.second |= Mask; + } + } +} + +// Check register def/use conflicts, occupancy limits and collect def/use maps. +// Return true if instruction can be bundled with previous. It it cannot +// def/use maps are not updated. +bool SIFormMemoryClauses::processRegUses(const MachineInstr &MI, + RegUse &Defs, RegUse &Uses, + GCNDownwardRPTracker &RPT) { + if (!canBundle(MI, Defs, Uses)) + return false; + + if (!checkPressure(MI, RPT)) + return false; + + collectRegUses(MI, Defs, Uses); + return true; +} + +bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!ST->isXNACKEnabled()) + return false; + + const SIInstrInfo *TII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); + MRI = &MF.getRegInfo(); + MFI = MF.getInfo<SIMachineFunctionInfo>(); + LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); + SlotIndexes *Ind = LIS->getSlotIndexes(); + bool Changed = false; + + MaxVGPRs = TRI->getAllocatableSet(MF, &AMDGPU::VGPR_32RegClass).count(); + MaxSGPRs = TRI->getAllocatableSet(MF, &AMDGPU::SGPR_32RegClass).count(); + + for (MachineBasicBlock &MBB : MF) { + MachineBasicBlock::instr_iterator Next; + for (auto I = MBB.instr_begin(), E = MBB.instr_end(); I != E; I = Next) { + MachineInstr &MI = *I; + Next = std::next(I); + + bool IsVMEM = isVMEMClauseInst(MI); + + if (!isValidClauseInst(MI, IsVMEM)) + continue; + + RegUse Defs, Uses; + GCNDownwardRPTracker RPT(*LIS); + RPT.reset(MI); + + if (!processRegUses(MI, Defs, Uses, RPT)) + continue; + + unsigned Length = 1; + for ( ; Next != E && Length < MaxClause; ++Next) { + if (!isValidClauseInst(*Next, IsVMEM)) + break; + + // A load from pointer which was loaded inside the same bundle is an + // impossible clause because we will need to write and read the same + // register inside. In this case processRegUses will return false. 
+ if (!processRegUses(*Next, Defs, Uses, RPT)) + break; + + ++Length; + } + if (Length < 2) + continue; + + Changed = true; + MFI->limitOccupancy(LastRecordedOccupancy); + + auto B = BuildMI(MBB, I, DebugLoc(), TII->get(TargetOpcode::BUNDLE)); + Ind->insertMachineInstrInMaps(*B); + + for (auto BI = I; BI != Next; ++BI) { + BI->bundleWithPred(); + Ind->removeSingleMachineInstrFromMaps(*BI); + + for (MachineOperand &MO : BI->defs()) + if (MO.readsReg()) + MO.setIsInternalRead(true); + } + + for (auto &&R : Defs) { + forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) { + unsigned S = R.second.first | RegState::EarlyClobber; + if (!SubReg) + S &= ~(RegState::Undef | RegState::Dead); + B.addDef(R.first, S, SubReg); + }); + } + + for (auto &&R : Uses) { + forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) { + B.addUse(R.first, R.second.first & ~RegState::Kill, SubReg); + }); + } + + for (auto &&R : Defs) { + unsigned Reg = R.first; + Uses.erase(Reg); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + continue; + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + } + + for (auto &&R : Uses) { + unsigned Reg = R.first; + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + continue; + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + } + } + } + + return Changed; +} diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 89bb98dbd028..ac0ef90f25a4 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -12,7 +12,9 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -21,19 +23,19 @@ using namespace llvm; -static ArrayRef<MCPhysReg> getAllSGPR128(const SISubtarget &ST, +static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST, const MachineFunction &MF) { return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4); } -static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST, +static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST, const MachineFunction &MF) { return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF)); } -void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST, +void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB) const { const SIInstrInfo *TII = ST.getInstrInfo(); @@ -96,7 +98,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST, } unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( - const SISubtarget &ST, + const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, @@ -147,7 +149,7 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( // SGPRs. std::pair<unsigned, unsigned> SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( - const SISubtarget &ST, + const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, @@ -218,7 +220,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was // specified. 
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (ST.debuggerEmitPrologue()) emitDebuggerPrologue(MF, MBB); @@ -235,6 +237,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function &F = MF.getFunction(); // We need to do the replacement of the private segment buffer and wave offset // register even if there are no stack objects. There could be stores to undef @@ -286,7 +289,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdCodeObjectV2(MF)) { + if (ST.isAmdCodeObjectV2(F)) { PreloadedPrivateBufferReg = MFI->getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); } @@ -305,7 +308,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, } if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) { - assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)); + assert(ST.isAmdCodeObjectV2(F) || ST.isMesaGfxShader(F)); MRI.addLiveIn(PreloadedPrivateBufferReg); MBB.addLiveIn(PreloadedPrivateBufferReg); } @@ -330,7 +333,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, bool CopyBuffer = ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister && - ST.isAmdCodeObjectV2(MF) && + ST.isAmdCodeObjectV2(F) && ScratchRsrcReg != PreloadedPrivateBufferReg; // This needs to be careful of the copying order to avoid overwriting one of @@ -361,13 +364,14 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, } // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. -void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST, +void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI, MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg, unsigned ScratchRsrcReg) const { const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const Function &Fn = MF.getFunction(); DebugLoc DL; if (ST.isAmdPalOS()) { @@ -387,12 +391,27 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST, const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64); BuildMI(MBB, I, DL, GetPC64, Rsrc01); } + auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in + if (ST.hasMergedShaders()) { + switch (MF.getFunction().getCallingConv()) { + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_GS: + // Low GIT address is passed in s8 rather than s0 for an LS+HS or + // ES+GS merged shader on gfx9+. + GitPtrLo = AMDGPU::SGPR8; + break; + default: + break; + } + } + MF.getRegInfo().addLiveIn(GitPtrLo); + MF.front().addLiveIn(GitPtrLo); BuildMI(MBB, I, DL, SMovB32, RsrcLo) - .addReg(AMDGPU::SGPR0) // Low address passed in + .addReg(GitPtrLo) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); // We now have the GIT ptr - now get the scratch descriptor from the entry - // at offset 0. + // at offset 0 (or offset 16 for a compute shader). 
PointerType *PtrTy = PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()), AMDGPUAS::CONSTANT_ADDRESS); @@ -403,17 +422,18 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST, MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable, 0, 0); + unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0; BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg) .addReg(Rsrc01) - .addImm(0) // offset + .addImm(Offset) // offset .addImm(0) // glc .addReg(ScratchRsrcReg, RegState::ImplicitDefine) .addMemOperand(MMO); return; } - if (ST.isMesaGfxShader(MF) + if (ST.isMesaGfxShader(Fn) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) { - assert(!ST.isAmdCodeObjectV2(MF)); + assert(!ST.isAmdCodeObjectV2(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); @@ -474,17 +494,52 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST, } } +// Find a scratch register that we can use at the start of the prologue to +// re-align the stack pointer. We avoid using callee-save registers since they +// may appear to be free when this is called from canUseAsPrologue (during +// shrink wrapping), but then no longer be free when this is called from +// emitPrologue. +// +// FIXME: This is a bit conservative, since in the above case we could use one +// of the callee-save registers as a scratch temp to re-align the stack pointer, +// but we would then have to make sure that we were in fact saving at least one +// callee-save register in the prologue, which is additional complexity that +// doesn't seem worth the benefit. +static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) { + MachineFunction *MF = MBB.getParent(); + + const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo(); + LivePhysRegs LiveRegs(TRI); + LiveRegs.addLiveIns(MBB); + + // Mark callee saved registers as used so we will not choose them. + const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF); + for (unsigned i = 0; CSRegs[i]; ++i) + LiveRegs.addReg(CSRegs[i]); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + + for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) { + if (LiveRegs.available(MRI, Reg)) + return Reg; + } + + return AMDGPU::NoRegister; +} + void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); if (FuncInfo->isEntryFunction()) { emitEntryFunctionPrologue(MF, MBB); return; } const MachineFrameInfo &MFI = MF.getFrameInfo(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); unsigned FramePtrReg = FuncInfo->getFrameOffsetReg(); @@ -492,8 +547,34 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.begin(); DebugLoc DL; + // XXX - Is this the right predicate? 
+ bool NeedFP = hasFP(MF); - if (NeedFP) { + uint32_t NumBytes = MFI.getStackSize(); + uint32_t RoundedSize = NumBytes; + const bool NeedsRealignment = TRI.needsStackRealignment(MF); + + if (NeedsRealignment) { + assert(NeedFP); + const unsigned Alignment = MFI.getMaxAlignment(); + + RoundedSize += Alignment; + + unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB); + assert(ScratchSPReg != AMDGPU::NoRegister); + + // s_add_u32 tmp_reg, s32, NumBytes + // s_and_b32 s32, tmp_reg, 0b111...0000 + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg) + .addReg(StackPtrReg) + .addImm((Alignment - 1) * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) + .addReg(ScratchSPReg, RegState::Kill) + .addImm(-Alignment * ST.getWavefrontSize()) + .setMIFlag(MachineInstr::FrameSetup); + FuncInfo->setIsStackRealigned(true); + } else if (NeedFP) { // If we need a base pointer, set it up here. It's whatever the value of // the stack pointer is at this point. Any variable size objects will be // allocated after this, so we can still use the base pointer to reference @@ -503,11 +584,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } - uint32_t NumBytes = MFI.getStackSize(); - if (NumBytes != 0 && hasSP(MF)) { + if (RoundedSize != 0 && hasSP(MF)) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) .addReg(StackPtrReg) - .addImm(NumBytes * ST.getWavefrontSize()) + .addImm(RoundedSize * ST.getWavefrontSize()) .setMIFlag(MachineInstr::FrameSetup); } @@ -527,7 +607,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, if (FuncInfo->isEntryFunction()) return; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); @@ -553,10 +633,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, // it's really whether we need SP to be accurate or not. if (NumBytes != 0 && hasSP(MF)) { + uint32_t RoundedSize = FuncInfo->isStackRealigned() ? 
+ NumBytes + MFI.getMaxAlignment() : NumBytes; + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) .addReg(StackPtrReg) - .addImm(NumBytes * ST.getWavefrontSize()) - .setMIFlag(MachineInstr::FrameDestroy); + .addImm(RoundedSize * ST.getWavefrontSize()); } } @@ -572,7 +654,7 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { - const SIRegisterInfo *RI = MF.getSubtarget<SISubtarget>().getRegisterInfo(); + const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); FrameReg = RI->getFrameRegister(MF); return MF.getFrameInfo().getObjectOffset(FI); @@ -586,7 +668,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (!MFI.hasStackObjects()) return; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); @@ -611,6 +693,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (TII->isSGPRSpill(MI)) { int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); + assert(MFI.getStackID(FI) == SIStackID::SGPR_SPILL); if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) { bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS); (void)Spilled; @@ -667,7 +750,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( if (Amount == 0) return MBB.erase(I); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const DebugLoc &DL = I->getDebugLoc(); unsigned Opc = I->getOpcode(); @@ -696,7 +779,7 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -746,7 +829,8 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const { } bool SIFrameLowering::hasSP(const MachineFunction &MF) const { + const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo(); // All stack operations are relative to the frame offset SGPR. 
const MachineFrameInfo &MFI = MF.getFrameInfo(); - return MFI.hasCalls() || MFI.hasVarSizedObjects(); + return MFI.hasCalls() || MFI.hasVarSizedObjects() || TRI->needsStackRealignment(MF); } diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index df6f1632a316..2f35b3631cdc 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -17,7 +17,7 @@ namespace llvm { class SIInstrInfo; class SIMachineFunctionInfo; class SIRegisterInfo; -class SISubtarget; +class GCNSubtarget; class SIFrameLowering final : public AMDGPUFrameLowering { public: @@ -48,29 +48,29 @@ public: MachineBasicBlock::iterator MI) const override; private: - void emitFlatScratchInit(const SISubtarget &ST, + void emitFlatScratchInit(const GCNSubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB) const; unsigned getReservedPrivateSegmentBufferReg( - const SISubtarget &ST, + const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, MachineFunction &MF) const; std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg( - const SISubtarget &ST, + const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, SIMachineFunctionInfo *MFI, MachineFunction &MF) const; - /// \brief Emits debugger prologue. + /// Emits debugger prologue. void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const; // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. - void emitEntryFunctionScratchSetup(const SISubtarget &ST, MachineFunction &MF, + void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI, MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg, unsigned ScratchRsrcReg) const; diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 50ee88fa635a..5b7fc2656a20 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Custom DAG lowering for SI +/// Custom DAG lowering for SI // //===----------------------------------------------------------------------===// @@ -26,6 +26,7 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" @@ -49,7 +50,6 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -73,6 +73,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include <cassert> @@ -111,8 +112,9 @@ static unsigned findFirstFreeSGPR(CCState &CCInfo) { } SITargetLowering::SITargetLowering(const TargetMachine &TM, - const SISubtarget &STI) - : AMDGPUTargetLowering(TM, STI) { + const GCNSubtarget &STI) + : AMDGPUTargetLowering(TM, STI), + Subtarget(&STI) { addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); @@ -138,14 +140,15 @@ 
SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->has16BitInsts()) { addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass); addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass); - } - if (Subtarget->hasVOP3PInsts()) { + // Unless there are also VOP3P operations, not operations are really legal. addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass); addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); } - computeRegisterProperties(STI.getRegisterInfo()); + computeRegisterProperties(Subtarget->getRegisterInfo()); // We need to custom lower vector stores from local memory setOperationAction(ISD::LOAD, MVT::v2i32, Custom); @@ -173,7 +176,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); setOperationAction(ISD::SELECT, MVT::i1, Promote); setOperationAction(ISD::SELECT, MVT::i64, Custom); @@ -205,13 +207,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i1, Expand); @@ -231,13 +237,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SUBCARRY, MVT::i64, Legal); #endif - //setOperationAction(ISD::ADDC, MVT::i64, Expand); - //setOperationAction(ISD::SUBC, MVT::i64, Expand); - // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, - MVT::v2i64, MVT::v2f64}) { + MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -260,6 +263,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } } + setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand); + // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that // is expanded to avoid having two separate loops in case the index is a VGPR. @@ -284,12 +289,30 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); + // Avoid stack access for these. // TODO: Generalize to more vector types. 
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom); + + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom); // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, // and output demarshalling @@ -301,7 +324,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand); - if (getSubtarget()->hasFlatAddressSpace()) { + if (Subtarget->hasFlatAddressSpace()) { setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); } @@ -314,13 +337,56 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::TRAP, MVT::Other, Custom); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom); + if (Subtarget->has16BitInsts()) { + setOperationAction(ISD::FLOG, MVT::f16, Custom); + setOperationAction(ISD::FLOG10, MVT::f16, Custom); + } + + // v_mad_f32 does not support denormals according to some sources. + if (!Subtarget->hasFP32Denormals()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); + + if (!Subtarget->hasBFI()) { + // fcopysign can be done in a single instruction with BFI. + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); + } + + if (!Subtarget->hasBCNT(32)) + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + + if (!Subtarget->hasBCNT(64)) + setOperationAction(ISD::CTPOP, MVT::i64, Expand); + + if (Subtarget->hasFFBH()) + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); + + if (Subtarget->hasFFBL()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); + + // We only really have 32-bit BFE instructions (and 16-bit on VI). + // + // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any + // effort to match them now. We want this to be false for i64 cases when the + // extraction isn't restricted to the upper or lower half. Ideally we would + // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that + // span the midpoint are probably relatively rare, so don't worry about them + // for now. 
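// Illustrative sketch, not part of the patch: a scalar model of the 32-bit
// bitfield extract the comment above refers to, assuming the usual
// v_bfe_u32 behaviour of "shift right by offset, keep the low 'width' bits".
// The helper name is made up for this example.
#include <cassert>
#include <cstdint>

static uint32_t bfe_u32(uint32_t src, unsigned offset, unsigned width) {
  offset &= 31;
  width &= 31;                 // a width of 0 extracts nothing
  if (width == 0)
    return 0;
  return (src >> offset) & ((1u << width) - 1u);
}

int main() {
  uint64_t v = 0x123456789abcdef0ull;
  // An i64 extract confined to the low half maps onto one 32-bit BFE.
  assert(bfe_u32(static_cast<uint32_t>(v), 4, 12) == ((v >> 4) & 0xfff));
  // An extract spanning bit 32 (say bits [28..39]) would need pieces from
  // both halves plus a shift/or; that is the midpoint-spanning case the
  // comment above treats as rare.
  return 0;
}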
+ if (Subtarget->hasBFE()) + setHasExtractBitsInsn(true); + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); - if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); + } else { + setOperationAction(ISD::FCEIL, MVT::f64, Custom); + setOperationAction(ISD::FTRUNC, MVT::f64, Custom); + setOperationAction(ISD::FRINT, MVT::f64, Custom); + setOperationAction(ISD::FFLOOR, MVT::f64, Custom); } setOperationAction(ISD::FFLOOR, MVT::f64, Legal); @@ -357,6 +423,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote); setOperationAction(ISD::CTLZ, MVT::i16, Promote); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote); + setOperationAction(ISD::CTPOP, MVT::i16, Promote); setOperationAction(ISD::SELECT_CC, MVT::i16, Expand); @@ -406,10 +473,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f16, Legal); if (!Subtarget->hasFP16Denormals()) setOperationAction(ISD::FMAD, MVT::f16, Legal); - } - if (Subtarget->hasVOP3PInsts()) { - for (MVT VT : {MVT::v2i16, MVT::v2f16}) { + for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -436,6 +501,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::Constant, MVT::v2i16, Legal); setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal); + setOperationAction(ISD::UNDEF, MVT::v2i16, Legal); + setOperationAction(ISD::UNDEF, MVT::v2f16, Legal); + setOperationAction(ISD::STORE, MVT::v2i16, Promote); AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32); setOperationAction(ISD::STORE, MVT::v2f16, Promote); @@ -452,11 +520,38 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32); setOperationAction(ISD::XOR, MVT::v2i16, Promote); AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32); - setOperationAction(ISD::SELECT, MVT::v2i16, Promote); - AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); - setOperationAction(ISD::SELECT, MVT::v2f16, Promote); - AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); + setOperationAction(ISD::LOAD, MVT::v4i16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::v4f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32); + + setOperationAction(ISD::STORE, MVT::v4i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v4f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); + + setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + + setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand); + + if (!Subtarget->hasVOP3PInsts()) { + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); + } + + setOperationAction(ISD::FNEG, MVT::v2f16, Legal); + // This isn't really legal, 
but this avoids the legalizer unrolling it (and + // allows matching fneg (fabs x) patterns) + setOperationAction(ISD::FABS, MVT::v2f16, Legal); + } + + if (Subtarget->hasVOP3PInsts()) { setOperationAction(ISD::ADD, MVT::v2i16, Legal); setOperationAction(ISD::SUB, MVT::v2i16, Legal); setOperationAction(ISD::MUL, MVT::v2i16, Legal); @@ -469,26 +564,51 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMAX, MVT::v2i16, Legal); setOperationAction(ISD::FADD, MVT::v2f16, Legal); - setOperationAction(ISD::FNEG, MVT::v2f16, Legal); setOperationAction(ISD::FMUL, MVT::v2f16, Legal); setOperationAction(ISD::FMA, MVT::v2f16, Legal); setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal); setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal); - - // This isn't really legal, but this avoids the legalizer unrolling it (and - // allows matching fneg (fabs x) patterns) - setOperationAction(ISD::FABS, MVT::v2f16, Legal); + setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); + setOperationAction(ISD::SHL, MVT::v4i16, Custom); + setOperationAction(ISD::SRA, MVT::v4i16, Custom); + setOperationAction(ISD::SRL, MVT::v4i16, Custom); + setOperationAction(ISD::ADD, MVT::v4i16, Custom); + setOperationAction(ISD::SUB, MVT::v4i16, Custom); + setOperationAction(ISD::MUL, MVT::v4i16, Custom); + + setOperationAction(ISD::SMIN, MVT::v4i16, Custom); + setOperationAction(ISD::SMAX, MVT::v4i16, Custom); + setOperationAction(ISD::UMIN, MVT::v4i16, Custom); + setOperationAction(ISD::UMAX, MVT::v4i16, Custom); + + setOperationAction(ISD::FADD, MVT::v4f16, Custom); + setOperationAction(ISD::FMUL, MVT::v4f16, Custom); + setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom); + setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom); + + setOperationAction(ISD::SELECT, MVT::v4i16, Custom); + setOperationAction(ISD::SELECT, MVT::v4f16, Custom); + } + + setOperationAction(ISD::FNEG, MVT::v4f16, Custom); + setOperationAction(ISD::FABS, MVT::v4f16, Custom); + + if (Subtarget->has16BitInsts()) { + setOperationAction(ISD::SELECT, MVT::v2i16, Promote); + AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32); + setOperationAction(ISD::SELECT, MVT::v2f16, Promote); + AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32); } else { + // Legalization hack. setOperationAction(ISD::SELECT, MVT::v2i16, Custom); setOperationAction(ISD::SELECT, MVT::v2f16, Custom); + + setOperationAction(ISD::FNEG, MVT::v2f16, Custom); + setOperationAction(ISD::FABS, MVT::v2f16, Custom); } for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) { @@ -503,6 +623,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); + setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::SMIN); setTargetDAGCombine(ISD::SMAX); setTargetDAGCombine(ISD::UMIN); @@ -540,16 +661,33 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); setSchedulingPreference(Sched::RegPressure); + + // SI at least has hardware support for floating point exceptions, but no way + // of using or handling them is implemented. 
They are also optional in OpenCL + // (Section 7.3) + setHasFloatingPointExceptions(Subtarget->hasFPExceptions()); } -const SISubtarget *SITargetLowering::getSubtarget() const { - return static_cast<const SISubtarget *>(Subtarget); +const GCNSubtarget *SITargetLowering::getSubtarget() const { + return Subtarget; } //===----------------------------------------------------------------------===// // TargetLowering queries //===----------------------------------------------------------------------===// +// v_mad_mix* support a conversion from f16 to f32. +// +// There is only one special case when denormals are enabled we don't currently, +// where this is OK to use. +bool SITargetLowering::isFPExtFoldable(unsigned Opcode, + EVT DestVT, EVT SrcVT) const { + return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && + DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() && + SrcVT.getScalarType() == MVT::f16; +} + bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { // SI has some legal vector types, but no legal vector operations. Say no // shuffles are legal in order to prefer scalarizing some vector operations. @@ -560,9 +698,55 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, unsigned IntrID) const { + if (const AMDGPU::RsrcIntrinsic *RsrcIntr = + AMDGPU::lookupRsrcIntrinsic(IntrID)) { + AttributeList Attr = Intrinsic::getAttributes(CI.getContext(), + (Intrinsic::ID)IntrID); + if (Attr.hasFnAttribute(Attribute::ReadNone)) + return false; + + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + if (RsrcIntr->IsImage) { + Info.ptrVal = MFI->getImagePSV( + *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), + CI.getArgOperand(RsrcIntr->RsrcArg)); + Info.align = 0; + } else { + Info.ptrVal = MFI->getBufferPSV( + *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), + CI.getArgOperand(RsrcIntr->RsrcArg)); + } + + Info.flags = MachineMemOperand::MODereferenceable; + if (Attr.hasFnAttribute(Attribute::ReadOnly)) { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.flags |= MachineMemOperand::MOLoad; + } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) { + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); + Info.flags |= MachineMemOperand::MOStore; + } else { + // Atomic + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable; + + // XXX - Should this be volatile without known ordering? 
+ Info.flags |= MachineMemOperand::MOVolatile; + } + return true; + } + switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: { + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); @@ -575,6 +759,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } + default: return false; } @@ -585,7 +770,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, Type *&AccessTy) const { switch (II->getIntrinsicID()) { case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: { + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { Value *Ptr = II->getArgOperand(0); AccessTy = II->getType(); Ops.push_back(Ptr); @@ -675,7 +863,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, if (AS == AMDGPUASI.GLOBAL_ADDRESS) return isLegalGlobalAddressingMode(AM); - if (AS == AMDGPUASI.CONSTANT_ADDRESS) { + if (AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) { // If the offset isn't a multiple of 4, it probably isn't going to be // correctly aligned. // FIXME: Can we get the real alignment here? @@ -686,19 +875,19 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // will use a MUBUF load. // FIXME?: We also need to do this if unaligned, but we don't know the // alignment here. - if (DL.getTypeStoreSize(Ty) < 4) + if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4) return isLegalGlobalAddressingMode(AM); - if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { // SMRD instructions have an 8-bit, dword offset on SI. if (!isUInt<8>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) { + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { // On CI+, this can also be a 32-bit literal constant offset. If it fits // in 8-bits, it can use a smaller encoding. if (!isUInt<32>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { // On VI, these use the SMEM format and the offset is 20-bit in bytes. if (!isUInt<20>(AM.BaseOffs)) return false; @@ -798,7 +987,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // If we have an uniform constant load, it still requires using a slow // buffer instruction if unaligned. if (IsFast) { - *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ? + *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS || + AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ? 
(Align % 4 == 0) : true; } @@ -841,7 +1031,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) { return AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS; + AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT; } bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, @@ -853,7 +1044,7 @@ bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { const MemSDNode *MemNode = cast<MemSDNode>(N); const Value *Ptr = MemNode->getMemOperand()->getValue(); - const Instruction *I = dyn_cast<Instruction>(Ptr); + const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); return I && I->getMetadata("amdgpu.noclobber"); } @@ -870,7 +1061,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS, bool SITargetLowering::isMemOpUniform(const SDNode *N) const { const MemSDNode *MemNode = cast<MemSDNode>(N); - return AMDGPU::isUniformMMO(MemNode->getMemOperand()); + return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand()); } TargetLoweringBase::LegalizeTypeAction @@ -932,14 +1123,13 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); - return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(Offset, SL, PtrVT)); + return DAG.getObjectPtrOffset(SL, BasePtr, Offset); } SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const { - auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>(); - uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); + uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(), + FIRST_IMPLICIT); return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset); } @@ -966,18 +1156,42 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, SDValue SITargetLowering::lowerKernargMemParameter( SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, - uint64_t Offset, bool Signed, + uint64_t Offset, unsigned Align, bool Signed, const ISD::InputArg *Arg) const { - const DataLayout &DL = DAG.getDataLayout(); Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); - unsigned Align = DL.getABITypeAlignment(Ty); + // Try to avoid using an extload by loading earlier than the argument address, + // and extracting the relevant bits. The load should hopefully be merged with + // the previous argument. + if (MemVT.getStoreSize() < 4 && Align < 4) { + // TODO: Handle align < 4 and size >= 4 (can happen with packed structs). + int64_t AlignDownOffset = alignDown(Offset, 4); + int64_t OffsetDiff = Offset - AlignDownOffset; + + EVT IntVT = MemVT.changeTypeToInteger(); + + // TODO: If we passed in the base kernel offset we could have a better + // alignment than 4, but we don't really need it. 
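// Illustrative sketch, not part of the patch: the byte-level effect of the
// dword-aligned load plus shift described above, modelling the kernarg
// segment as a little-endian byte buffer. Helper names are made up here.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t load_dword(const uint8_t *kernarg, uint64_t offset) {
  uint32_t d;
  std::memcpy(&d, kernarg + offset, 4);          // a naturally aligned dword load
  return d;
}

static uint16_t load_i16_arg(const uint8_t *kernarg, uint64_t offset) {
  uint64_t aligned_down = offset & ~uint64_t(3); // alignDown(Offset, 4)
  uint64_t diff = offset - aligned_down;         // OffsetDiff in bytes
  uint32_t dword = load_dword(kernarg, aligned_down);
  return static_cast<uint16_t>(dword >> (diff * 8)); // SRL, then TRUNCATE
}

int main() {
  // Two i16 arguments packed at offsets 4 and 6: both are served by the same
  // dword load, so no extending sub-dword load is needed.
  uint8_t kernarg[8] = {0, 0, 0, 0, 0x34, 0x12, 0x78, 0x56};
  assert(load_i16_arg(kernarg, 4) == 0x1234);
  assert(load_i16_arg(kernarg, 6) == 0x5678);
  return 0;
}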
+ SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset); + SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4, + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); + + SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32); + SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt); + + SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract); + ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal); + ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg); + + + return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL); + } SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align, - MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); @@ -1052,36 +1266,51 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, FunctionType *FType, SIMachineFunctionInfo *Info) { for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { - const ISD::InputArg &Arg = Ins[I]; + const ISD::InputArg *Arg = &Ins[I]; // First check if it's a PS input addr. - if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() && - !Arg.Flags.isByVal() && PSInputNum <= 15) { + if (CallConv == CallingConv::AMDGPU_PS && + !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) { + + bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum); + + // Inconveniently only the first part of the split is marked as isSplit, + // so skip to the end. We only want to increment PSInputNum once for the + // entire split argument. + if (Arg->Flags.isSplit()) { + while (!Arg->Flags.isSplitEnd()) { + assert(!Arg->VT.isVector() && + "unexpected vector split in ps argument type"); + if (!SkipArg) + Splits.push_back(*Arg); + Arg = &Ins[++I]; + } + } - if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { + if (SkipArg) { // We can safely skip PS inputs. - Skipped.set(I); + Skipped.set(Arg->getOrigArgIndex()); ++PSInputNum; continue; } Info->markPSInputAllocated(PSInputNum); - if (Arg.Used) + if (Arg->Used) Info->markPSInputEnabled(PSInputNum); ++PSInputNum; } // Second split vertices into their elements. - if (Arg.VT.isVector()) { - ISD::InputArg NewArg = Arg; + if (Arg->VT.isVector()) { + ISD::InputArg NewArg = *Arg; NewArg.Flags.setSplit(); - NewArg.VT = Arg.VT.getVectorElementType(); + NewArg.VT = Arg->VT.getVectorElementType(); // We REALLY want the ORIGINAL number of vertex elements here, e.g. a // three or five element vertex only needs three or five registers, // NOT four or eight. - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + Type *ParamType = FType->getParamType(Arg->getOrigArgIndex()); unsigned NumElements = ParamType->getVectorNumElements(); for (unsigned J = 0; J != NumElements; ++J) { @@ -1089,7 +1318,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, NewArg.PartOffset += NewArg.VT.getStoreSize(); } } else { - Splits.push_back(Arg); + Splits.push_back(*Arg); } } } @@ -1347,8 +1576,8 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // the scratch registers to pass in. 
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - if (ST.isAmdCodeObjectV2(MF)) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + if (ST.isAmdCodeObjectV2(MF.getFunction())) { if (RequiresStackAccess) { // If we have stack objects, we unquestionably need the private buffer // resource. For the Code Object V2 ABI, this will be the first 4 user @@ -1460,12 +1689,12 @@ SDValue SITargetLowering::LowerFormalArguments( const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); MachineFunction &MF = DAG.getMachineFunction(); + const Function &Fn = MF.getFunction(); FunctionType *FType = MF.getFunction().getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { - const Function &Fn = MF.getFunction(); DiagnosticInfoUnsupported NoGraphicsHSA( Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); DAG.getContext()->diagnose(NoGraphicsHSA); @@ -1562,9 +1791,16 @@ SDValue SITargetLowering::LowerFormalArguments( SmallVector<SDValue, 16> Chains; - for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + // FIXME: This is the minimum kernel argument alignment. We should improve + // this to the maximum alignment of the arguments. + // + // FIXME: Alignment of explicit arguments totally broken with non-0 explicit + // kern arg offset. + const unsigned KernelArgBaseAlign = 16; + + for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { const ISD::InputArg &Arg = Ins[i]; - if (Skipped[i]) { + if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) { InVals.push_back(DAG.getUNDEF(Arg.VT)); continue; } @@ -1576,19 +1812,16 @@ SDValue SITargetLowering::LowerFormalArguments( VT = Ins[i].VT; EVT MemVT = VA.getLocVT(); - const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) + - VA.getLocMemOffset(); - Info->setABIArgOffset(Offset + MemVT.getStoreSize()); + const uint64_t Offset = VA.getLocMemOffset(); + unsigned Align = MinAlign(KernelArgBaseAlign, Offset); - // The first 36 bytes of the input buffer contains information about - // thread group and global sizes. SDValue Arg = lowerKernargMemParameter( - DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]); + DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]); Chains.push_back(Arg.getValue(1)); auto *ParamTy = dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); - if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { // On SI local pointers are just offsets into LDS, so they are always // less than 16-bits. On CI and newer they could potentially be @@ -1696,7 +1929,7 @@ SDValue SITargetLowering::LowerFormalArguments( auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); - ArgUsageInfo.setFuncArgInfo(MF.getFunction(), Info->getArgInfo()); + ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo()); unsigned StackArgSize = CCInfo.getNextStackOffset(); Info->setBytesInStackArgArea(StackArgSize); @@ -1841,8 +2074,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // FIXME: Does sret work properly? 
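// Illustrative sketch, not part of the patch: how the per-argument alignment
// above falls out of the 16-byte kernarg base alignment and the argument's
// offset. The standalone helper mirrors what llvm::MinAlign computes (the
// largest power of two dividing both inputs).
#include <cassert>
#include <cstdint>

static uint64_t min_align(uint64_t a, uint64_t b) {
  uint64_t x = a | b;
  return x & (~x + 1);  // lowest set bit, i.e. the common power-of-two factor
}

int main() {
  const uint64_t KernelArgBaseAlign = 16;
  assert(min_align(KernelArgBaseAlign, 0) == 16); // first argument
  assert(min_align(KernelArgBaseAlign, 8) == 8);
  assert(min_align(KernelArgBaseAlign, 6) == 2);  // a packed 16-bit argument
  return 0;
}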
if (!Info->isEntryFunction()) { - const SIRegisterInfo *TRI - = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { @@ -1944,8 +2176,7 @@ void SITargetLowering::passSpecialInputs( SelectionDAG &DAG = CLI.DAG; const SDLoc &DL = CLI.DL; - const SISubtarget *ST = getSubtarget(); - const SIRegisterInfo *TRI = ST->getRegisterInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); @@ -2138,6 +2369,13 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, "unsupported required tail call to function "); } + if (AMDGPU::isShader(MF.getFunction().getCallingConv())) { + // Note the issue is with the CC of the calling function, not of the call + // itself. + return lowerUnhandledCall(CLI, InVals, + "unsupported call from graphics shader of function "); + } + // The first 4 bytes are reserved for the callee's emergency stack slot. const unsigned CalleeUsableStackOffset = 4; @@ -2383,7 +2621,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // Add a register mask operand representing the call-preserved registers. - const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); + auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -2443,7 +2681,7 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, } - if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) { report_fatal_error(Twine("invalid register \"" + StringRef(RegName) + "\" for subtarget.")); @@ -2517,7 +2755,8 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( unsigned PhiReg, unsigned InitSaveExecReg, int Offset, - bool UseGPRIdxMode) { + bool UseGPRIdxMode, + bool IsIndirectSrc) { MachineBasicBlock::iterator I = LoopBB.begin(); unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); @@ -2546,6 +2785,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( .addReg(CurrentIdxReg) .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg()); + // Update EXEC, save the original EXEC value to VCC. + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec) + .addReg(CondReg, RegState::Kill); + + MRI.setSimpleHint(NewExec, CondReg); + if (UseGPRIdxMode) { unsigned IdxReg; if (Offset == 0) { @@ -2556,11 +2801,13 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( .addReg(CurrentIdxReg, RegState::Kill) .addImm(Offset); } - - MachineInstr *SetIdx = - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX)) - .addReg(IdxReg, RegState::Kill); - SetIdx->getOperand(2).setIsUndef(); + unsigned IdxMode = IsIndirectSrc ? + VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE; + MachineInstr *SetOn = + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) + .addReg(IdxReg, RegState::Kill) + .addImm(IdxMode); + SetOn->getOperand(3).setIsUndef(); } else { // Move index from VCC into M0 if (Offset == 0) { @@ -2573,12 +2820,6 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( } } - // Update EXEC, save the original EXEC value to VCC. 
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec) - .addReg(CondReg, RegState::Kill); - - MRI.setSimpleHint(NewExec, CondReg); - // Update EXEC, switch all done bits to 0 and all todo bits to 1. MachineInstr *InsertPt = BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) @@ -2606,7 +2847,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, unsigned InitResultReg, unsigned PhiReg, int Offset, - bool UseGPRIdxMode) { + bool UseGPRIdxMode, + bool IsIndirectSrc) { MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const DebugLoc &DL = MI.getDebugLoc(); @@ -2645,7 +2887,7 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx, InitResultReg, DstReg, PhiReg, TmpExec, - Offset, UseGPRIdxMode); + Offset, UseGPRIdxMode, IsIndirectSrc); MachineBasicBlock::iterator First = RemainderBB->begin(); BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) @@ -2730,7 +2972,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, // Control flow needs to be inserted if indexing with a VGPR. static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, - const SISubtarget &ST) { + const GCNSubtarget &ST) { const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); MachineFunction *MF = MBB.getParent(); @@ -2780,17 +3022,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); - if (UseGPRIdxMode) { - MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) - .addImm(0) // Reset inside loop. - .addImm(VGPRIndexMode::SRC0_ENABLE); - SetOn->getOperand(3).setIsUndef(); - - // Disable again after the loop. - BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); - } - - auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode); + auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, + Offset, UseGPRIdxMode, true); MachineBasicBlock *LoopBB = InsPt->getParent(); if (UseGPRIdxMode) { @@ -2798,6 +3031,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, .addReg(SrcReg, RegState::Undef, SubReg) .addReg(SrcReg, RegState::Implicit) .addReg(AMDGPU::M0, RegState::Implicit); + BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); } else { BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) .addReg(SrcReg, RegState::Undef, SubReg) @@ -2829,7 +3063,7 @@ static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI, static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, - const SISubtarget &ST) { + const GCNSubtarget &ST) { const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); MachineFunction *MF = MBB.getParent(); @@ -2898,22 +3132,10 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); - if (UseGPRIdxMode) { - MachineBasicBlock::iterator I(&MI); - - MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) - .addImm(0) // Reset inside loop. - .addImm(VGPRIndexMode::DST_ENABLE); - SetOn->getOperand(3).setIsUndef(); - - // Disable again after the loop. 
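// Illustrative sketch, not part of the patch: the waterfall loop that
// emitLoadM0FromVGPRLoop builds, modelled with plain integers (4 lanes here
// for brevity; a real wave has 64 lanes and the masks live in EXEC/VCC).
// Each trip picks the index held by the first active lane, services every
// lane that holds the same index, and clears those lanes until none remain.
#include <cassert>
#include <cstdint>

static unsigned waterfall_trips(const uint32_t idx[4], uint64_t exec) {
  unsigned trips = 0;
  while (exec) {                                   // s_cbranch_execnz back edge
    unsigned first = 0;
    while (!((exec >> first) & 1))
      ++first;                                     // v_readfirstlane_b32
    uint32_t cur = idx[first];
    uint64_t same = 0;
    for (unsigned l = 0; l < 4; ++l)               // v_cmp_eq_u32 across lanes
      if (((exec >> l) & 1) && idx[l] == cur)
        same |= uint64_t(1) << l;
    // The body (for example S_SET_GPR_IDX_ON plus the indexed move) runs here
    // with the execution mask narrowed to 'same', so the index is wave-uniform.
    ++trips;
    exec &= ~same;                                 // drop the serviced lanes
  }
  return trips;
}

int main() {
  uint32_t idx[4] = {0, 2, 0, 1};                  // per-lane dynamic indices
  assert(waterfall_trips(idx, 0xf) == 3);          // three distinct index values
  assert(waterfall_trips(idx, 0x5) == 1);          // lanes 0 and 2 agree
  return 0;
}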
- BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); - } - unsigned PhiReg = MRI.createVirtualRegister(VecRC); auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, - Offset, UseGPRIdxMode); + Offset, UseGPRIdxMode, false); MachineBasicBlock *LoopBB = InsPt->getParent(); if (UseGPRIdxMode) { @@ -2923,6 +3145,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, .addReg(Dst, RegState::ImplicitDefine) .addReg(PhiReg, RegState::Implicit) .addReg(AMDGPU::M0, RegState::Implicit); + BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); } else { const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC)); @@ -2946,24 +3169,12 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); if (TII->isMIMG(MI)) { - if (!MI.memoperands_empty()) - return BB; + if (MI.memoperands_empty() && MI.mayLoadOrStore()) { + report_fatal_error("missing mem operand from MIMG instruction"); + } // Add a memoperand for mimg instructions so that they aren't assumed to // be ordered memory instuctions. - MachinePointerInfo PtrInfo(MFI->getImagePSV()); - MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable; - if (MI.mayStore()) - Flags |= MachineMemOperand::MOStore; - - if (MI.mayLoad()) - Flags |= MachineMemOperand::MOLoad; - - if (Flags != MachineMemOperand::MODereferenceable) { - auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0); - MI.addMemOperand(*MF, MMO); - } - return BB; } @@ -3145,8 +3356,13 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::ADJCALLSTACKDOWN: { const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); MachineInstrBuilder MIB(*MF, &MI); + + // Add an implicit use of the frame offset reg to prevent the restore copy + // inserted after the call from being reorderd after stack operations in the + // the caller's frame. MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine) - .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit); + .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit) + .addReg(Info->getFrameOffsetReg(), RegState::Implicit); return BB; } case AMDGPU::SI_CALL_ISEL: @@ -3236,12 +3452,17 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { VT = VT.getScalarType(); switch (VT.getSimpleVT().SimpleTy) { - case MVT::f32: + case MVT::f32: { // This is as fast on some subtargets. However, we always have full rate f32 // mad available which returns the same result as the separate operations // which we should prefer over fma. We can't use this if we want to support // denormals, so only report this in these cases. - return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32(); + if (Subtarget->hasFP32Denormals()) + return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); + + // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32. + return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts(); + } case MVT::f64: return true; case MVT::f16: @@ -3257,6 +3478,49 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { // Custom DAG Lowering Operations //===----------------------------------------------------------------------===// +// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the +// wider vector type is legal. 
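// Illustrative sketch, not part of the patch: the shape of the split that
// splitUnaryVectorOp / splitBinaryVectorOp perform below. A v4i16 value is
// held as two packed v2i16 halves (one 32-bit word each), the operation runs
// once per half, and the results are concatenated. The packed add stands in
// for any per-lane v2i16 operation; the names are made up for this example.
#include <cassert>
#include <cstdint>
#include <utility>

typedef uint32_t V2I16;                        // lane 0 in bits [15:0], lane 1 in [31:16]
typedef std::pair<V2I16, V2I16> V4I16;         // low half, high half

static V2I16 add_v2i16(V2I16 a, V2I16 b) {     // per-lane add, no cross-lane carry
  uint32_t lo = (a + b) & 0x0000ffffu;
  uint32_t hi = ((a & 0xffff0000u) + (b & 0xffff0000u)) & 0xffff0000u;
  return lo | hi;
}

static V4I16 add_v4i16(V4I16 a, V4I16 b) {     // split, operate per half, concat
  return V4I16(add_v2i16(a.first, b.first), add_v2i16(a.second, b.second));
}

int main() {
  V4I16 a(0x00020001u, 0x00040003u);           // <1, 2, 3, 4>
  V4I16 b(0x00200010u, 0xffff0030u);           // <0x10, 0x20, 0x30, 0xffff>
  V4I16 r = add_v4i16(a, b);
  assert(r.first == 0x00220011u);              // <0x11, 0x22, ...>
  assert(r.second == 0x00030033u);             // <..., 0x33, 4 + 0xffff wraps>
  return 0;
}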
+SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, + SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + assert(VT == MVT::v4f16); + + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); + + SDLoc SL(Op); + SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, + Op->getFlags()); + SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, + Op->getFlags()); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); +} + +// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the +// wider vector type is legal. +SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, + SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + assert(VT == MVT::v4i16 || VT == MVT::v4f16); + + SDValue Lo0, Hi0; + std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); + SDValue Lo1, Hi1; + std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1); + + SDLoc SL(Op); + + SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, + Op->getFlags()); + SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, + Op->getFlags()); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); +} + SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); @@ -3289,15 +3553,105 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::BUILD_VECTOR: + return lowerBUILD_VECTOR(Op, DAG); case ISD::FP_ROUND: return lowerFP_ROUND(Op, DAG); case ISD::TRAP: - case ISD::DEBUGTRAP: return lowerTRAP(Op, DAG); + case ISD::DEBUGTRAP: + return lowerDEBUGTRAP(Op, DAG); + case ISD::FABS: + case ISD::FNEG: + return splitUnaryVectorOp(Op, DAG); + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + case ISD::SMIN: + case ISD::SMAX: + case ISD::UMIN: + case ISD::UMAX: + case ISD::FMINNUM: + case ISD::FMAXNUM: + case ISD::FADD: + case ISD::FMUL: + return splitBinaryVectorOp(Op, DAG); } return SDValue(); } +static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, + const SDLoc &DL, + SelectionDAG &DAG, bool Unpacked) { + if (!LoadVT.isVector()) + return Result; + + if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16. + // Truncate to v2i16/v4i16. + EVT IntLoadVT = LoadVT.changeTypeToInteger(); + + // Workaround legalizer not scalarizing truncate after vector op + // legalization byt not creating intermediate vector trunc. + SmallVector<SDValue, 4> Elts; + DAG.ExtractVectorElements(Result, Elts); + for (SDValue &Elt : Elts) + Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt); + + Result = DAG.getBuildVector(IntLoadVT, DL, Elts); + + // Bitcast to original type (v2f16/v4f16). + return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result); + } + + // Cast back to the original packed type. + return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result); +} + +SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, + MemSDNode *M, + SelectionDAG &DAG, + bool IsIntrinsic) const { + SDLoc DL(M); + SmallVector<SDValue, 10> Ops; + Ops.reserve(M->getNumOperands()); + + Ops.push_back(M->getOperand(0)); + if (IsIntrinsic) + Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32)); + + // Skip 1, as it is the intrinsic ID. 
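// Illustrative sketch, not part of the patch: the repacking that
// adjustLoadValueTypeImpl above performs for "unpacked D16" subtargets, where
// a d16 load returns each 16-bit element in its own 32-bit register. The
// per-element truncate-and-rebuild below is the scalar equivalent; on packed
// subtargets the loaded dword is already the final v2f16 bit pattern.
#include <cassert>
#include <cstdint>

static uint32_t repack_v2f16(uint32_t elt0, uint32_t elt1) {
  // Truncate each 32-bit element to 16 bits, then rebuild the packed pair.
  return (elt0 & 0xffffu) | ((elt1 & 0xffffu) << 16);
}

int main() {
  // Half-precision bit patterns for 1.0 (0x3c00) and 2.0 (0x4000).
  uint32_t unpacked0 = 0x00003c00u, unpacked1 = 0x00004000u; // v2i32 form
  uint32_t packed = 0x40003c00u;                             // v2f16 form
  assert(repack_v2f16(unpacked0, unpacked1) == packed);
  return 0;
}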
+ for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I) + Ops.push_back(M->getOperand(I)); + + bool Unpacked = Subtarget->hasUnpackedD16VMem(); + EVT LoadVT = M->getValueType(0); + + EVT EquivLoadVT = LoadVT; + if (Unpacked && LoadVT.isVector()) { + EquivLoadVT = LoadVT.isVector() ? + EVT::getVectorVT(*DAG.getContext(), MVT::i32, + LoadVT.getVectorNumElements()) : LoadVT; + } + + // Change from v4f16/v2f16 to EquivLoadVT. + SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other); + + SDValue Load + = DAG.getMemIntrinsicNode( + IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, + VTList, Ops, M->getMemoryVT(), + M->getMemOperand()); + if (!Unpacked) // Just adjusted the opcode. + return Load; + + SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked); + + return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL); +} + void SITargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { @@ -3314,7 +3668,8 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); - if (IID == Intrinsic::amdgcn_cvt_pkrtz) { + switch (IID) { + case Intrinsic::amdgcn_cvt_pkrtz: { SDValue Src0 = N->getOperand(1); SDValue Src1 = N->getOperand(2); SDLoc SL(N); @@ -3323,6 +3678,38 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt)); return; } + case Intrinsic::amdgcn_cvt_pknorm_i16: + case Intrinsic::amdgcn_cvt_pknorm_u16: + case Intrinsic::amdgcn_cvt_pk_i16: + case Intrinsic::amdgcn_cvt_pk_u16: { + SDValue Src0 = N->getOperand(1); + SDValue Src1 = N->getOperand(2); + SDLoc SL(N); + unsigned Opcode; + + if (IID == Intrinsic::amdgcn_cvt_pknorm_i16) + Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; + else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16) + Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; + else if (IID == Intrinsic::amdgcn_cvt_pk_i16) + Opcode = AMDGPUISD::CVT_PK_I16_I32; + else + Opcode = AMDGPUISD::CVT_PK_U16_U32; + + SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1); + Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt)); + return; + } + } + break; + } + case ISD::INTRINSIC_W_CHAIN: { + if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) { + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + return; + } + break; } case ISD::SELECT: { @@ -3347,12 +3734,38 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect)); return; } + case ISD::FNEG: { + if (N->getValueType(0) != MVT::v2f16) + break; + + SDLoc SL(N); + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); + + SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, + BC, + DAG.getConstant(0x80008000, SL, MVT::i32)); + Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); + return; + } + case ISD::FABS: { + if (N->getValueType(0) != MVT::v2f16) + break; + + SDLoc SL(N); + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); + + SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, + BC, + DAG.getConstant(0x7fff7fff, SL, MVT::i32)); + Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); + return; + } default: break; } } -/// \brief Helper function for LowerBRCOND +/// Helper function for LowerBRCOND static SDNode *findUser(SDValue Value, unsigned Opcode) { SDNode *Parent = Value.getNode(); @@ -3417,13 +3830,15 @@ void 
SITargetLowering::createDebuggerPrologueStackObjects( bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { const Triple &TT = getTargetMachine().getTargetTriple(); - return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && + return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && AMDGPU::shouldEmitConstantsToTextSection(TT); } bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || - GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) && + GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); } @@ -3560,40 +3975,37 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); - MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = Op.getOperand(0); - unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ? - SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap; - - if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa && - Subtarget->isTrapHandlerEnabled()) { - SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - unsigned UserSGPR = Info->getQueuePtrUserSGPR(); - assert(UserSGPR != AMDGPU::NoRegister); - - SDValue QueuePtr = CreateLiveInRegister( - DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); - - SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); - - SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, - QueuePtr, SDValue()); + if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || + !Subtarget->isTrapHandlerEnabled()) + return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); - SDValue Ops[] = { - ToReg, - DAG.getTargetConstant(TrapID, SL, MVT::i16), - SGPR01, - ToReg.getValue(1) - }; + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + unsigned UserSGPR = Info->getQueuePtrUserSGPR(); + assert(UserSGPR != AMDGPU::NoRegister); + SDValue QueuePtr = CreateLiveInRegister( + DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); + SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, + QueuePtr, SDValue()); + SDValue Ops[] = { + ToReg, + DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16), + SGPR01, + ToReg.getValue(1) + }; + return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); +} - return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); - } +SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Chain = Op.getOperand(0); + MachineFunction &MF = DAG.getMachineFunction(); - switch (TrapID) { - case SISubtarget::TrapIDLLVMTrap: - return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); - case SISubtarget::TrapIDLLVMDebugTrap: { + if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || + !Subtarget->isTrapHandlerEnabled()) { DiagnosticInfoUnsupported NoTrap(MF.getFunction(), "debugtrap handler not supported", Op.getDebugLoc(), @@ -3602,11 +4014,12 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { Ctx.diagnose(NoTrap); return Chain; } - default: - 
llvm_unreachable("unsupported trap handler type!"); - } - return Chain; + SDValue Ops[] = { + Chain, + DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16) + }; + return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); } SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, @@ -3719,34 +4132,78 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { + SDValue Vec = Op.getOperand(0); + SDValue InsVal = Op.getOperand(1); SDValue Idx = Op.getOperand(2); + EVT VecVT = Vec.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + unsigned VecSize = VecVT.getSizeInBits(); + unsigned EltSize = EltVT.getSizeInBits(); + + + assert(VecSize <= 64); + + unsigned NumElts = VecVT.getVectorNumElements(); + SDLoc SL(Op); + auto KIdx = dyn_cast<ConstantSDNode>(Idx); + + if (NumElts == 4 && EltSize == 16 && KIdx) { + SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec); + + SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec, + DAG.getConstant(0, SL, MVT::i32)); + SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec, + DAG.getConstant(1, SL, MVT::i32)); + + SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf); + SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf); + + unsigned Idx = KIdx->getZExtValue(); + bool InsertLo = Idx < 2; + SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, + InsertLo ? LoVec : HiVec, + DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal), + DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32)); + + InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf); + + SDValue Concat = InsertLo ? + DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) : + DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf }); + + return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat); + } + if (isa<ConstantSDNode>(Idx)) return SDValue(); + MVT IntVT = MVT::getIntegerVT(VecSize); + // Avoid stack access for dynamic indexing. - SDLoc SL(Op); - SDValue Vec = Op.getOperand(0); - SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1)); + SDValue Val = InsVal; + if (InsVal.getValueType() == MVT::f16) + Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal); // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec - SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val); + SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val); - // Convert vector index to bit-index. - SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, - DAG.getConstant(16, SL, MVT::i32)); + assert(isPowerOf2_32(EltSize)); + SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); - SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); + // Convert vector index to bit-index. 
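// Illustrative sketch, not part of the patch: the bitfield-merge idea behind
// the v_bfm_b32 / v_bfi_b32 comment above, written out with plain integers.
// One way to realise it: build a mask for the selected element from the
// scaled index, move the new value under that mask, and merge with the
// original packed vector. Helper and variable names are made up here.
#include <cassert>
#include <cstdint>

static uint64_t insert_elt16(uint64_t vec, uint16_t val, unsigned idx) {
  unsigned scaled = idx * 16;                 // vector index -> bit index
  uint64_t bfm = uint64_t(0xffff) << scaled;  // mask covering the element
  uint64_t placed = uint64_t(val) << scaled;  // value moved into position
  return (bfm & placed) | (~bfm & vec);       // bitfield insert / merge
}

int main() {
  uint64_t v4 = 0x0004000300020001ull;        // v4i16 <1, 2, 3, 4>
  assert(insert_elt16(v4, 0xabcd, 2) == 0x0004abcd00020001ull);
  assert(insert_elt16(v4, 0xabcd, 0) == 0x000400030002abcdull);
  return 0;
}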
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); - SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32, - DAG.getConstant(0xffff, SL, MVT::i32), + SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); + SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT, + DAG.getConstant(0xffff, SL, IntVT), ScaledIdx); - SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal); - SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32, - DAG.getNOT(SL, BFM, MVT::i32), BCVec); + SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal); + SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT, + DAG.getNOT(SL, BFM, IntVT), BCVec); - SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS); - return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI); + SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS); + return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI); } SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, @@ -3756,51 +4213,87 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, EVT ResultVT = Op.getValueType(); SDValue Vec = Op.getOperand(0); SDValue Idx = Op.getOperand(1); + EVT VecVT = Vec.getValueType(); + unsigned VecSize = VecVT.getSizeInBits(); + EVT EltVT = VecVT.getVectorElementType(); + assert(VecSize <= 64); DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr); - // Make sure we we do any optimizations that will make it easier to fold + // Make sure we do any optimizations that will make it easier to fold // source modifiers before obscuring it with bit operations. // XXX - Why doesn't this get called when vector_shuffle is expanded? if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) return Combined; - if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) { - SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); + unsigned EltSize = EltVT.getSizeInBits(); + assert(isPowerOf2_32(EltSize)); - if (CIdx->getZExtValue() == 1) { - Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result, - DAG.getConstant(16, SL, MVT::i32)); - } else { - assert(CIdx->getZExtValue() == 0); - } + MVT IntVT = MVT::getIntegerVT(VecSize); + SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); + + // Convert vector index to bit-index (* EltSize) + SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor); - if (ResultVT.bitsLT(MVT::i32)) - Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result); + SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); + SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx); + + if (ResultVT == MVT::f16) { + SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt); return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); } - SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32); + return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT); +} - // Convert vector index to bit-index. - SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen); +SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + EVT VT = Op.getValueType(); + + if (VT == MVT::v4i16 || VT == MVT::v4f16) { + EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2); + + // Turn into pair of packed build_vectors. + // TODO: Special case for constants that can be materialized with s_mov_b64. 
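// Illustrative sketch, not part of the patch: what the v4i16/v4f16
// BUILD_VECTOR lowering below amounts to at the bit level. The four 16-bit
// elements are packed pairwise into two 32-bit words (the two packed
// build_vectors), which together form the 64-bit result. Names are made up.
#include <cassert>
#include <cstdint>

static uint32_t pack2(uint16_t e0, uint16_t e1) {            // one packed half
  return uint32_t(e0) | (uint32_t(e1) << 16);
}

static uint64_t build_v4i16(uint16_t e0, uint16_t e1, uint16_t e2, uint16_t e3) {
  uint32_t lo = pack2(e0, e1);                               // elements 0 and 1
  uint32_t hi = pack2(e2, e3);                               // elements 2 and 3
  return uint64_t(lo) | (uint64_t(hi) << 32);                // concatenated pair
}

int main() {
  assert(build_v4i16(1, 2, 3, 4) == 0x0004000300020001ull);
  return 0;
}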
+ SDValue Lo = DAG.getBuildVector(HalfVT, SL, + { Op.getOperand(0), Op.getOperand(1) }); + SDValue Hi = DAG.getBuildVector(HalfVT, SL, + { Op.getOperand(2), Op.getOperand(3) }); + + SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo); + SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi); + + SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi }); + return DAG.getNode(ISD::BITCAST, SL, VT, Blend); + } - SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); - SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx); + assert(VT == MVT::v2f16 || VT == MVT::v2i16); - SDValue Result = Elt; - if (ResultVT.bitsLT(MVT::i32)) - Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result); + SDValue Lo = Op.getOperand(0); + SDValue Hi = Op.getOperand(1); - return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); + Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo); + Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi); + + Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo); + Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi); + + SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi, + DAG.getConstant(16, SL, MVT::i32)); + + SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi); + + return DAG.getNode(ISD::BITCAST, SL, VT, Or); } bool SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // We can fold offsets for anything that doesn't require a GOT relocation. return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || - GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) && + GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && !shouldEmitGOTReloc(GA->getGlobal()); } @@ -3853,6 +4346,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, const GlobalValue *GV = GSD->getGlobal(); if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS && + GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT && GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS && // FIXME: It isn't correct to rely on the type of the pointer. This should // be removed when address space 0 is 64-bit. @@ -3905,7 +4399,7 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, unsigned Offset) const { SDLoc SL(Op); SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL, - DAG.getEntryNode(), Offset, false); + DAG.getEntryNode(), Offset, 4, false); // The local size values will have the hi 16-bits as zero. 
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, DAG.getValueType(VT)); @@ -3929,6 +4423,245 @@ static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, return DAG.getUNDEF(VT); } +static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, + ArrayRef<SDValue> Elts) { + assert(!Elts.empty()); + MVT Type; + unsigned NumElts; + + if (Elts.size() == 1) { + Type = MVT::f32; + NumElts = 1; + } else if (Elts.size() == 2) { + Type = MVT::v2f32; + NumElts = 2; + } else if (Elts.size() <= 4) { + Type = MVT::v4f32; + NumElts = 4; + } else if (Elts.size() <= 8) { + Type = MVT::v8f32; + NumElts = 8; + } else { + assert(Elts.size() <= 16); + Type = MVT::v16f32; + NumElts = 16; + } + + SmallVector<SDValue, 16> VecElts(NumElts); + for (unsigned i = 0; i < Elts.size(); ++i) { + SDValue Elt = Elts[i]; + if (Elt.getValueType() != MVT::f32) + Elt = DAG.getBitcast(MVT::f32, Elt); + VecElts[i] = Elt; + } + for (unsigned i = Elts.size(); i < NumElts; ++i) + VecElts[i] = DAG.getUNDEF(MVT::f32); + + if (NumElts == 1) + return VecElts[0]; + return DAG.getBuildVector(Type, DL, VecElts); +} + +static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, + SDValue *GLC, SDValue *SLC) { + auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode()); + if (!CachePolicyConst) + return false; + + uint64_t Value = CachePolicyConst->getZExtValue(); + SDLoc DL(CachePolicy); + if (GLC) { + *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32); + Value &= ~(uint64_t)0x1; + } + if (SLC) { + *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32); + Value &= ~(uint64_t)0x2; + } + + return Value == 0; +} + +SDValue SITargetLowering::lowerImage(SDValue Op, + const AMDGPU::ImageDimIntrinsicInfo *Intr, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); + + SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end()); + bool IsD16 = false; + SDValue VData; + int NumVDataDwords; + unsigned AddrIdx; // Index of first address argument + unsigned DMask; + + if (BaseOpcode->Atomic) { + VData = Op.getOperand(2); + + bool Is64Bit = VData.getValueType() == MVT::i64; + if (BaseOpcode->AtomicX2) { + SDValue VData2 = Op.getOperand(3); + VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL, + {VData, VData2}); + if (Is64Bit) + VData = DAG.getBitcast(MVT::v4i32, VData); + + ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; + DMask = Is64Bit ? 0xf : 0x3; + NumVDataDwords = Is64Bit ? 4 : 2; + AddrIdx = 4; + } else { + DMask = Is64Bit ? 0x3 : 0x1; + NumVDataDwords = Is64Bit ? 
2 : 1; + AddrIdx = 3; + } + } else { + unsigned DMaskIdx; + + if (BaseOpcode->Store) { + VData = Op.getOperand(2); + + MVT StoreVT = VData.getSimpleValueType(); + if (StoreVT.getScalarType() == MVT::f16) { + if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS || + !BaseOpcode->HasD16) + return Op; // D16 is unsupported for this instruction + + IsD16 = true; + VData = handleD16VData(VData, DAG); + } + + NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32; + DMaskIdx = 3; + } else { + MVT LoadVT = Op.getSimpleValueType(); + if (LoadVT.getScalarType() == MVT::f16) { + if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS || + !BaseOpcode->HasD16) + return Op; // D16 is unsupported for this instruction + + IsD16 = true; + if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem()) + ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32; + } + + NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32; + DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1; + } + + auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx)); + if (!DMaskConst) + return Op; + + AddrIdx = DMaskIdx + 1; + DMask = DMaskConst->getZExtValue(); + if (!DMask && !BaseOpcode->Store) { + // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they + // store the channels' default values. + SDValue Undef = DAG.getUNDEF(Op.getValueType()); + if (isa<MemSDNode>(Op)) + return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL); + return Undef; + } + } + + unsigned NumVAddrs = BaseOpcode->NumExtraArgs + + (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) + + (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) + + (BaseOpcode->LodOrClampOrMip ? 1 : 0); + SmallVector<SDValue, 4> VAddrs; + for (unsigned i = 0; i < NumVAddrs; ++i) + VAddrs.push_back(Op.getOperand(AddrIdx + i)); + SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs); + + SDValue True = DAG.getTargetConstant(1, DL, MVT::i1); + SDValue False = DAG.getTargetConstant(0, DL, MVT::i1); + unsigned CtrlIdx; // Index of texfailctrl argument + SDValue Unorm; + if (!BaseOpcode->Sampler) { + Unorm = True; + CtrlIdx = AddrIdx + NumVAddrs + 1; + } else { + auto UnormConst = + dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2)); + if (!UnormConst) + return Op; + + Unorm = UnormConst->getZExtValue() ? True : False; + CtrlIdx = AddrIdx + NumVAddrs + 3; + } + + SDValue TexFail = Op.getOperand(CtrlIdx); + auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode()); + if (!TexFailConst || TexFailConst->getZExtValue() != 0) + return Op; + + SDValue GLC; + SDValue SLC; + if (BaseOpcode->Atomic) { + GLC = True; // TODO no-return optimization + if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC)) + return Op; + } else { + if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC)) + return Op; + } + + SmallVector<SDValue, 14> Ops; + if (BaseOpcode->Store || BaseOpcode->Atomic) + Ops.push_back(VData); // vdata + Ops.push_back(VAddr); + Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc + if (BaseOpcode->Sampler) + Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler + Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32)); + Ops.push_back(Unorm); + Ops.push_back(GLC); + Ops.push_back(SLC); + Ops.push_back(False); // r128 + Ops.push_back(False); // tfe + Ops.push_back(False); // lwe + Ops.push_back(DimInfo->DA ? True : False); + if (BaseOpcode->HasD16) + Ops.push_back(IsD16 ? 
True : False); + if (isa<MemSDNode>(Op)) + Ops.push_back(Op.getOperand(0)); // chain + + int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32; + int Opcode = -1; + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx8, + NumVDataDwords, NumVAddrDwords); + if (Opcode == -1) + Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx6, + NumVDataDwords, NumVAddrDwords); + assert(Opcode != -1); + + MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops); + if (auto MemOp = dyn_cast<MemSDNode>(Op)) { + MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1); + *MemRefs = MemOp->getMemOperand(); + NewNode->setMemRefs(MemRefs, MemRefs + 1); + } + + if (BaseOpcode->AtomicX2) { + SmallVector<SDValue, 1> Elt; + DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); + return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); + } else if (IsD16 && !BaseOpcode->Store) { + MVT LoadVT = Op.getSimpleValueType(); + SDValue Adjusted = adjustLoadValueTypeImpl( + SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem()); + return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL); + } + + return SDValue(NewNode, 0); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -3942,14 +4675,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_implicit_buffer_ptr: { - if (getSubtarget()->isAmdCodeObjectV2(MF)) + if (getSubtarget()->isAmdCodeObjectV2(MF.getFunction())) return emitNonHSAIntrinsicError(DAG, DL, VT); return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); } case Intrinsic::amdgcn_dispatch_ptr: case Intrinsic::amdgcn_queue_ptr: { - if (!Subtarget->isAmdCodeObjectV2(MF)) { + if (!Subtarget->isAmdCodeObjectV2(MF.getFunction())) { DiagnosticInfoUnsupported BadIntrin( MF.getFunction(), "unsupported hsa intrinsic without hsa target", DL.getDebugLoc()); @@ -3979,16 +4712,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_rsq: return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_rsq_legacy: - if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return emitRemovedIntrinsicError(DAG, DL, VT); return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_rcp_legacy: - if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return emitRemovedIntrinsicError(DAG, DL, VT); return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_rsq_clamp: { - if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); Type *Type = VT.getTypeForEVT(*DAG.getContext()); @@ -4006,37 +4739,37 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_X, false); + SI::KernelInputOffsets::NGROUPS_X, 4, false); case Intrinsic::r600_read_ngroups_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); 
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Y, false); + SI::KernelInputOffsets::NGROUPS_Y, 4, false); case Intrinsic::r600_read_ngroups_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Z, false); + SI::KernelInputOffsets::NGROUPS_Z, 4, false); case Intrinsic::r600_read_global_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_X, false); + SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false); case Intrinsic::r600_read_global_size_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); + SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false); case Intrinsic::r600_read_global_size_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); + SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false); case Intrinsic::r600_read_local_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); @@ -4125,7 +4858,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_log_clamp: { - if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return SDValue(); DiagnosticInfoUnsupported BadIntrin( @@ -4210,6 +4943,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_fmed3: return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_fdot2: + return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_fmul_legacy: return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -4221,10 +4957,27 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_ubfe: return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case Intrinsic::amdgcn_cvt_pkrtz: { - // FIXME: Stop adding cast if v2f16 legal. + case Intrinsic::amdgcn_cvt_pkrtz: + case Intrinsic::amdgcn_cvt_pknorm_i16: + case Intrinsic::amdgcn_cvt_pknorm_u16: + case Intrinsic::amdgcn_cvt_pk_i16: + case Intrinsic::amdgcn_cvt_pk_u16: { + // FIXME: Stop adding cast if v2f16/v2i16 are legal. 
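// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): each cvt_pk* intrinsic handled
// below converts two 32-bit sources into two 16-bit results and packs them
// into one dword, which is why the lowering builds an i32 node and then
// bitcasts it to the v2i16/v2f16 result type. A scalar model of the
// unsigned-saturating variant (the other variants differ only in the per-lane
// conversion); helper name is made up, assumes <cstdint> and <algorithm>:
static uint32_t cvtPkU16Model(uint32_t X, uint32_t Y) {
  auto Sat16 = [](uint32_t V) { return std::min<uint32_t>(V, 0xffffu); };
  return Sat16(X) | (Sat16(Y) << 16); // X in the low half, Y in the high half
}
// ---------------------------------------------------------------------------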
EVT VT = Op.getValueType(); - SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32, + unsigned Opcode; + + if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz) + Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32; + else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16) + Opcode = AMDGPUISD::CVT_PKNORM_I16_F32; + else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16) + Opcode = AMDGPUISD::CVT_PKNORM_U16_F32; + else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16) + Opcode = AMDGPUISD::CVT_PK_I16_I32; + else + Opcode = AMDGPUISD::CVT_PK_U16_U32; + + SDValue Node = DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2)); return DAG.getNode(ISD::BITCAST, DL, VT, Node); } @@ -4238,17 +4991,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src), 0); } - case Intrinsic::amdgcn_image_getlod: - case Intrinsic::amdgcn_image_getresinfo: { - unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4; - - // Replace dmask with everything disabled with undef. - const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx)); - if (!DMask || DMask->isNullValue()) - return DAG.getUNDEF(Op.getValueType()); - return SDValue(); - } + case Intrinsic::amdgcn_fmad_ftz: + return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); default: + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = + AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) + return lowerImage(Op, ImageDimIntr, DAG); + return Op; } } @@ -4257,14 +5007,34 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); SDLoc DL(Op); - MachineFunction &MF = DAG.getMachineFunction(); switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: { + case Intrinsic::amdgcn_atomic_dec: + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: { MemSDNode *M = cast<MemSDNode>(Op); - unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ? - AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC; + unsigned Opc; + switch (IntrID) { + case Intrinsic::amdgcn_atomic_inc: + Opc = AMDGPUISD::ATOMIC_INC; + break; + case Intrinsic::amdgcn_atomic_dec: + Opc = AMDGPUISD::ATOMIC_DEC; + break; + case Intrinsic::amdgcn_ds_fadd: + Opc = AMDGPUISD::ATOMIC_LOAD_FADD; + break; + case Intrinsic::amdgcn_ds_fmin: + Opc = AMDGPUISD::ATOMIC_LOAD_FMIN; + break; + case Intrinsic::amdgcn_ds_fmax: + Opc = AMDGPUISD::ATOMIC_LOAD_FMAX; + break; + default: + llvm_unreachable("Unknown intrinsic!"); + } SDValue Ops[] = { M->getOperand(0), // Chain M->getOperand(2), // Ptr @@ -4284,21 +5054,28 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // glc Op.getOperand(6) // slc }; - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? 
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); + auto *M = cast<MemSDNode>(Op); + EVT LoadVT = Op.getValueType(); + bool IsD16 = LoadVT.getScalarType() == MVT::f16; + if (IsD16) + return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(MFI->getBufferPSV()), - MachineMemOperand::MOLoad, - VT.getStoreSize(), VT.getStoreSize()); - - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand()); } case Intrinsic::amdgcn_tbuffer_load: { + MemSDNode *M = cast<MemSDNode>(Op); + EVT LoadVT = Op.getValueType(); + bool IsD16 = LoadVT.getScalarType() == MVT::f16; + if (IsD16) { + return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG); + } + SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc @@ -4312,14 +5089,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(10) // slc }; - EVT VT = Op.getOperand(2).getValueType(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad, - VT.getStoreSize(), VT.getStoreSize()); return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); + Op->getVTList(), Ops, LoadVT, + M->getMemOperand()); } case Intrinsic::amdgcn_buffer_atomic_swap: case Intrinsic::amdgcn_buffer_atomic_add: @@ -4339,14 +5111,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // offset Op.getOperand(6) // slc }; - EVT VT = Op.getOperand(3).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOVolatile, - VT.getStoreSize(), 4); + EVT VT = Op.getValueType(); + + auto *M = cast<MemSDNode>(Op); unsigned Opcode = 0; switch (IntrID) { @@ -4384,7 +5151,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, llvm_unreachable("unhandled atomic opcode"); } - return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO); + return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, + M->getMemOperand()); } case Intrinsic::amdgcn_buffer_atomic_cmpswap: { @@ -4397,78 +5165,46 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(6), // offset Op.getOperand(7) // slc }; - EVT VT = Op.getOperand(4).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOVolatile, - VT.getStoreSize(), 4); + EVT VT = Op.getValueType(); + auto *M = cast<MemSDNode>(Op); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, - Op->getVTList(), Ops, VT, MMO); + Op->getVTList(), Ops, VT, M->getMemOperand()); } - // Basic sample. - case Intrinsic::amdgcn_image_sample: - case Intrinsic::amdgcn_image_sample_cl: - case Intrinsic::amdgcn_image_sample_d: - case Intrinsic::amdgcn_image_sample_d_cl: - case Intrinsic::amdgcn_image_sample_l: - case Intrinsic::amdgcn_image_sample_b: - case Intrinsic::amdgcn_image_sample_b_cl: - case Intrinsic::amdgcn_image_sample_lz: - case Intrinsic::amdgcn_image_sample_cd: - case Intrinsic::amdgcn_image_sample_cd_cl: - - // Sample with comparison. 
- case Intrinsic::amdgcn_image_sample_c: - case Intrinsic::amdgcn_image_sample_c_cl: - case Intrinsic::amdgcn_image_sample_c_d: - case Intrinsic::amdgcn_image_sample_c_d_cl: - case Intrinsic::amdgcn_image_sample_c_l: - case Intrinsic::amdgcn_image_sample_c_b: - case Intrinsic::amdgcn_image_sample_c_b_cl: - case Intrinsic::amdgcn_image_sample_c_lz: - case Intrinsic::amdgcn_image_sample_c_cd: - case Intrinsic::amdgcn_image_sample_c_cd_cl: - - // Sample with offsets. - case Intrinsic::amdgcn_image_sample_o: - case Intrinsic::amdgcn_image_sample_cl_o: - case Intrinsic::amdgcn_image_sample_d_o: - case Intrinsic::amdgcn_image_sample_d_cl_o: - case Intrinsic::amdgcn_image_sample_l_o: - case Intrinsic::amdgcn_image_sample_b_o: - case Intrinsic::amdgcn_image_sample_b_cl_o: - case Intrinsic::amdgcn_image_sample_lz_o: - case Intrinsic::amdgcn_image_sample_cd_o: - case Intrinsic::amdgcn_image_sample_cd_cl_o: - - // Sample with comparison and offsets. - case Intrinsic::amdgcn_image_sample_c_o: - case Intrinsic::amdgcn_image_sample_c_cl_o: - case Intrinsic::amdgcn_image_sample_c_d_o: - case Intrinsic::amdgcn_image_sample_c_d_cl_o: - case Intrinsic::amdgcn_image_sample_c_l_o: - case Intrinsic::amdgcn_image_sample_c_b_o: - case Intrinsic::amdgcn_image_sample_c_b_cl_o: - case Intrinsic::amdgcn_image_sample_c_lz_o: - case Intrinsic::amdgcn_image_sample_c_cd_o: - case Intrinsic::amdgcn_image_sample_c_cd_cl_o: { - // Replace dmask with everything disabled with undef. - const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5)); - if (!DMask || DMask->isNullValue()) { - SDValue Undef = DAG.getUNDEF(Op.getValueType()); - return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op)); - } + default: + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = + AMDGPU::getImageDimIntrinsicInfo(IntrID)) + return lowerImage(Op, ImageDimIntr, DAG); return SDValue(); } - default: - return SDValue(); +} + +SDValue SITargetLowering::handleD16VData(SDValue VData, + SelectionDAG &DAG) const { + EVT StoreVT = VData.getValueType(); + + // No change for f16 and legal vector D16 types. + if (!StoreVT.isVector()) + return VData; + + SDLoc DL(VData); + assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16"); + + if (Subtarget->hasUnpackedD16VMem()) { + // We need to unpack the packed data to store. 
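// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): on subtargets with "unpacked"
// D16 memory instructions every 16-bit element travels in the low half of its
// own 32-bit lane, so a packed <2 x half> store payload is widened roughly as
// in this scalar model (helper name made up, assumes <cstdint>):
static void unpackD16V2Model(uint32_t PackedV2F16, uint32_t OutDwords[2]) {
  OutDwords[0] = PackedV2F16 & 0xffffu; // element 0, zero-extended to 32 bits
  OutDwords[1] = PackedV2F16 >> 16;     // element 1, zero-extended to 32 bits
}
// ---------------------------------------------------------------------------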
+ EVT IntStoreVT = StoreVT.changeTypeToInteger(); + SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); + + EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + StoreVT.getVectorNumElements()); + SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData); + return DAG.UnrollVectorOp(ZExt.getNode()); } + + assert(isTypeLegal(StoreVT)); + return VData; } SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, @@ -4558,7 +5294,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second; if (WGSize <= ST.getWavefrontSize()) return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other, @@ -4613,9 +5349,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } case Intrinsic::amdgcn_tbuffer_store: { + SDValue VData = Op.getOperand(2); + bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + if (IsD16) + VData = handleD16VData(VData, DAG); SDValue Ops[] = { Chain, - Op.getOperand(2), // vdata + VData, // vdata Op.getOperand(3), // rsrc Op.getOperand(4), // vindex Op.getOperand(5), // voffset @@ -4626,42 +5366,133 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(10), // glc Op.getOperand(11) // slc }; - EVT VT = Op.getOperand(3).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); + unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : + AMDGPUISD::TBUFFER_STORE_FORMAT; + MemSDNode *M = cast<MemSDNode>(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); } case Intrinsic::amdgcn_buffer_store: case Intrinsic::amdgcn_buffer_store_format: { + SDValue VData = Op.getOperand(2); + bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + if (IsD16) + VData = handleD16VData(VData, DAG); SDValue Ops[] = { Chain, - Op.getOperand(2), // vdata + VData, // vdata Op.getOperand(3), // rsrc Op.getOperand(4), // vindex Op.getOperand(5), // offset Op.getOperand(6), // glc Op.getOperand(7) // slc }; - EVT VT = Op.getOperand(3).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable, - VT.getStoreSize(), 4); + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? + AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; + Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; + MemSDNode *M = cast<MemSDNode>(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); + } + default: { + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = + AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) + return lowerImage(Op, ImageDimIntr, DAG); - unsigned Opcode = IntrinsicID == Intrinsic::amdgcn_buffer_store ? 
- AMDGPUISD::BUFFER_STORE : - AMDGPUISD::BUFFER_STORE_FORMAT; - return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO); + return Op; } + } +} - default: +static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, + ISD::LoadExtType ExtType, SDValue Op, + const SDLoc &SL, EVT VT) { + if (VT.bitsLT(Op.getValueType())) + return DAG.getNode(ISD::TRUNCATE, SL, VT, Op); + + switch (ExtType) { + case ISD::SEXTLOAD: + return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op); + case ISD::ZEXTLOAD: + return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op); + case ISD::EXTLOAD: + return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op); + case ISD::NON_EXTLOAD: return Op; } + + llvm_unreachable("invalid ext type"); +} + +SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + if (Ld->getAlignment() < 4 || Ld->isDivergent()) + return SDValue(); + + // FIXME: Constant loads should all be marked invariant. + unsigned AS = Ld->getAddressSpace(); + if (AS != AMDGPUASI.CONSTANT_ADDRESS && + AS != AMDGPUASI.CONSTANT_ADDRESS_32BIT && + (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant())) + return SDValue(); + + // Don't do this early, since it may interfere with adjacent load merging for + // illegal types. We can avoid losing alignment information for exotic types + // pre-legalize. + EVT MemVT = Ld->getMemoryVT(); + if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) || + MemVT.getSizeInBits() >= 32) + return SDValue(); + + SDLoc SL(Ld); + + assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) && + "unexpected vector extload"); + + // TODO: Drop only high part of range. + SDValue Ptr = Ld->getBasePtr(); + SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, + MVT::i32, SL, Ld->getChain(), Ptr, + Ld->getOffset(), + Ld->getPointerInfo(), MVT::i32, + Ld->getAlignment(), + Ld->getMemOperand()->getFlags(), + Ld->getAAInfo(), + nullptr); // Drop ranges + + EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); + if (MemVT.isFloatingPoint()) { + assert(Ld->getExtensionType() == ISD::NON_EXTLOAD && + "unexpected fp extload"); + TruncVT = MemVT.changeTypeToInteger(); + } + + SDValue Cvt = NewLoad; + if (Ld->getExtensionType() == ISD::SEXTLOAD) { + Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad, + DAG.getValueType(TruncVT)); + } else if (Ld->getExtensionType() == ISD::ZEXTLOAD || + Ld->getExtensionType() == ISD::NON_EXTLOAD) { + Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT); + } else { + assert(Ld->getExtensionType() == ISD::EXTLOAD); + } + + EVT VT = Ld->getValueType(0); + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + + DCI.AddToWorklist(Cvt.getNode()); + + // We may need to handle exotic cases, such as i16->i64 extloads, so insert + // the appropriate extension from the 32-bit load. + Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT); + DCI.AddToWorklist(Cvt.getNode()); + + // Handle conversion back to floating point if necessary. 
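// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the widening above trades a
// sub-dword uniform load for one full 32-bit load plus an in-register
// narrowing, so the access can still be selected as a scalar (SMEM) load,
// which operates on whole dwords. This is only safe because the original load
// is uniform and at least dword-aligned. A little-endian scalar model of the
// zero-extending i8 case (helper name made up, assumes <cstdint>):
static uint32_t widenedZExtLoadI8Model(const uint32_t *DwordAlignedPtr) {
  uint32_t Dword = *DwordAlignedPtr; // one dword load covering the byte
  return Dword & 0xffu;              // zero-extend-in-reg of the low byte
}
// ---------------------------------------------------------------------------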
+ Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt); + + return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL); } SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { @@ -4700,9 +5531,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); + unsigned Alignment = Load->getAlignment(); unsigned AS = Load->getAddressSpace(); if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - AS, Load->getAlignment())) { + AS, Alignment)) { SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); return DAG.getMergeValues(Ops, DL); @@ -4717,24 +5549,32 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS; unsigned NumElements = MemVT.getVectorNumElements(); - if (AS == AMDGPUASI.CONSTANT_ADDRESS) { - if (isMemOpUniform(Load)) + + if (AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) { + if (!Op->isDivergent() && Alignment >= 4) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. // } - if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) { - if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) && - !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load)) + + if (AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT || + AS == AMDGPUASI.GLOBAL_ADDRESS) { + if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && + !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) && + Alignment >= 4) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. // } - if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS || + if (AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT || + AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorLoad(Op, DAG); @@ -4761,21 +5601,20 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("unsupported private_element_size"); } } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { - if (NumElements > 2) - return SplitVectorLoad(Op, DAG); - - if (NumElements == 2) + // Use ds_read_b128 if possible. + if (Subtarget->useDS128() && Load->getAlignment() >= 16 && + MemVT.getStoreSize() == 16) return SDValue(); - // If properly aligned, if we split we might be able to use ds_read_b64. 
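// ---------------------------------------------------------------------------
// Illustrative example (not part of the patch): with the ds_read_b128 check
// added above, a 16-byte LDS load such as
//   %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 16
// is left whole on subtargets where useDS128() is true, while the same load
// with only 8-byte alignment is still split so the halves can be selected to
// ds_read_b64.
// ---------------------------------------------------------------------------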
- return SplitVectorLoad(Op, DAG); + if (NumElements > 2) + return SplitVectorLoad(Op, DAG); } return SDValue(); } SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType() != MVT::i64) - return SDValue(); + EVT VT = Op.getValueType(); + assert(VT.getSizeInBits() == 64); SDLoc DL(Op); SDValue Cond = Op.getOperand(0); @@ -4797,7 +5636,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); - return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); + return DAG.getNode(ISD::BITCAST, DL, VT, Res); } // Catch division cases where we can use shortcuts with rcp and rsq @@ -4809,8 +5648,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, SDValue RHS = Op.getOperand(1); EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || - Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal(); + bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal(); if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals()) return SDValue(); @@ -5067,7 +5905,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { SDValue Scale; - if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { // Workaround a hardware bug on SI where the condition output from div_scale // is not usable. @@ -5165,14 +6003,14 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("unsupported private_element_size"); } } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { + // Use ds_write_b128 if possible. + if (Subtarget->useDS128() && Store->getAlignment() >= 16 && + VT.getStoreSize() == 16) + return SDValue(); + if (NumElements > 2) return SplitVectorStore(Op, DAG); - - if (NumElements == 2) - return Op; - - // If properly aligned, if we split we might be able to use ds_write_b64. - return SplitVectorStore(Op, DAG); + return SDValue(); } else { llvm_unreachable("unhandled address space"); } @@ -5246,7 +6084,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, // easier if i8 vectors weren't promoted to i32 vectors, particularly after // types are legalized. v4i8 -> v4f32 is probably the only case to worry // about in practice. - if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { + if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) { if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); DCI.AddToWorklist(Cvt.getNode()); @@ -5389,6 +6227,71 @@ static bool isBoolSGPR(SDValue V) { return false; } +// If a constant has all zeroes or all ones within each byte return it. +// Otherwise return 0. +static uint32_t getConstantPermuteMask(uint32_t C) { + // 0xff for any zero byte in the mask + uint32_t ZeroByteMask = 0; + if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff; + if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00; + if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000; + if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000; + uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte + if ((NonZeroByteMask & C) != NonZeroByteMask) + return 0; // Partial bytes selected. + return C; +} + +// Check if a node selects whole bytes from its operand 0 starting at a byte +// boundary while masking the rest. 
Returns select mask as in the v_perm_b32 +// or -1 if not succeeded. +// Note byte select encoding: +// value 0-3 selects corresponding source byte; +// value 0xc selects zero; +// value 0xff selects 0xff. +static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) { + assert(V.getValueSizeInBits() == 32); + + if (V.getNumOperands() != 2) + return ~0; + + ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1)); + if (!N1) + return ~0; + + uint32_t C = N1->getZExtValue(); + + switch (V.getOpcode()) { + default: + break; + case ISD::AND: + if (uint32_t ConstMask = getConstantPermuteMask(C)) { + return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask); + } + break; + + case ISD::OR: + if (uint32_t ConstMask = getConstantPermuteMask(C)) { + return (0x03020100 & ~ConstMask) | ConstMask; + } + break; + + case ISD::SHL: + if (C % 8) + return ~0; + + return uint32_t((0x030201000c0c0c0cull << C) >> 32); + + case ISD::SRL: + if (C % 8) + return ~0; + + return uint32_t(0x0c0c0c0c03020100ull >> C); + } + + return ~0; +} + SDValue SITargetLowering::performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (DCI.isBeforeLegalize()) @@ -5435,6 +6338,20 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, } } } + + // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) + if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM && + isa<ConstantSDNode>(LHS.getOperand(2))) { + uint32_t Sel = getConstantPermuteMask(Mask); + if (!Sel) + return SDValue(); + + // Select 0xc for all zero bytes + Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c); + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), + LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32)); + } } // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> @@ -5487,6 +6404,54 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, LHS, DAG.getConstant(0, SDLoc(N), MVT::i32)); } + // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && + N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) { + uint32_t LHSMask = getPermuteMask(DAG, LHS); + uint32_t RHSMask = getPermuteMask(DAG, RHS); + if (LHSMask != ~0u && RHSMask != ~0u) { + // Canonicalize the expression in an attempt to have fewer unique masks + // and therefore fewer registers used to hold the masks. + if (LHSMask > RHSMask) { + std::swap(LHSMask, RHSMask); + std::swap(LHS, RHS); + } + + // Select 0xc for each lane used from source operand. Zero has 0xc mask + // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range. + uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; + uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; + + // Check of we need to combine values from two sources within a byte. + if (!(LHSUsedLanes & RHSUsedLanes) && + // If we select high and lower word keep it for SDWA. + // TODO: teach SDWA to work with v_perm_b32 and remove the check. + !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) { + // Each byte in each mask is either selector mask 0-3, or has higher + // bits set in either of masks, which can be 0xff for 0xff or 0x0c for + // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise + // mask which is not 0xff wins. By anding both masks we have a correct + // result except that 0x0c shall be corrected to give 0x0c only. 
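// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): a scalar model of the byte
// selection this combine targets, covering only the selector values documented
// above (0-3 pick a byte of the second source, 4-7 a byte of the first, 0x0c
// yields 0x00, anything else is treated here as the 0xff selector; the real
// instruction has additional selector values that this combine never emits).
// Helper name made up, assumes <cstdint>:
static uint32_t permB32Model(uint32_t Src0, uint32_t Src1, uint32_t Sel) {
  uint32_t Result = 0;
  for (unsigned I = 0; I < 4; ++I) {
    uint32_t S = (Sel >> (8 * I)) & 0xff;
    uint32_t Byte;
    if (S <= 3)
      Byte = (Src1 >> (8 * S)) & 0xff;       // byte S of the second source
    else if (S <= 7)
      Byte = (Src0 >> (8 * (S - 4))) & 0xff; // byte S-4 of the first source
    else if (S == 0x0c)
      Byte = 0x00;                           // constant zero
    else
      Byte = 0xff;                           // 0xff selector
    Result |= Byte << (8 * I);
  }
  return Result;
}
// ---------------------------------------------------------------------------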
+ uint32_t Mask = LHSMask & RHSMask; + for (unsigned I = 0; I < 32; I += 8) { + uint32_t ByteSel = 0xff << I; + if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c) + Mask &= (0x0c << I) & 0xffffffff; + } + + // Add 4 to each active LHS lane. It will not affect any existing 0xff + // or 0x0c. + uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404); + SDLoc DL(N); + + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, + LHS.getOperand(0), RHS.getOperand(0), + DAG.getConstant(Sel, DL, MVT::i32)); + } + } + } + return SDValue(); } @@ -5522,6 +6487,60 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, return SDValue(); } + // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) + if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() && + LHS.getOpcode() == AMDGPUISD::PERM && + isa<ConstantSDNode>(LHS.getOperand(2))) { + uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1)); + if (!Sel) + return SDValue(); + + Sel |= LHS.getConstantOperandVal(2); + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0), + LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32)); + } + + // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && + N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) { + uint32_t LHSMask = getPermuteMask(DAG, LHS); + uint32_t RHSMask = getPermuteMask(DAG, RHS); + if (LHSMask != ~0u && RHSMask != ~0u) { + // Canonicalize the expression in an attempt to have fewer unique masks + // and therefore fewer registers used to hold the masks. + if (LHSMask > RHSMask) { + std::swap(LHSMask, RHSMask); + std::swap(LHS, RHS); + } + + // Select 0xc for each lane used from source operand. Zero has 0xc mask + // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range. + uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; + uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c; + + // Check of we need to combine values from two sources within a byte. + if (!(LHSUsedLanes & RHSUsedLanes) && + // If we select high and lower word keep it for SDWA. + // TODO: teach SDWA to work with v_perm_b32 and remove the check. + !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) { + // Kill zero bytes selected by other mask. Zero value is 0xc. 
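// ---------------------------------------------------------------------------
// Illustrative worked example (not part of the patch, values derived by hand):
// for (x & 0x000000ff) | (y & 0x0000ff00) the per-operand masks are
//   getPermuteMask(x & 0xff)   == 0x0c0c0c00
//   getPermuteMask(y & 0xff00) == 0x0c0c010c
// their used-lane sets are disjoint, and the steps below merge them into one
// v_perm_b32 of y and x with selector 0x0c0c0500: byte0 comes from x
// (selector 0x00), byte1 from y (selector 0x05 = 4 + 1), and the two high
// bytes are zero (selector 0x0c).
// ---------------------------------------------------------------------------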
+ LHSMask &= ~RHSUsedLanes; + RHSMask &= ~LHSUsedLanes; + // Add 4 to each active LHS lane + LHSMask |= LHSUsedLanes & 0x04040404; + // Combine masks + uint32_t Sel = LHSMask | RHSMask; + SDLoc DL(N); + + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, + LHS.getOperand(0), RHS.getOperand(0), + DAG.getConstant(Sel, DL, MVT::i32)); + } + } + } + if (VT != MVT::i64) return SDValue(); @@ -5628,6 +6647,7 @@ static bool fp16SrcZerosHighBits(unsigned Opc) { case AMDGPUISD::FMAD_FTZ: case AMDGPUISD::RCP: case AMDGPUISD::RSQ: + case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::LDEXP: return true; default: @@ -5680,6 +6700,23 @@ SDValue SITargetLowering::performClassCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performRcpCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + + if (N0.isUndef()) + return N0; + + if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP || + N0.getOpcode() == ISD::SINT_TO_FP)) { + return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0, + N->getFlags()); + } + + return AMDGPUTargetLowering::performRcpCombine(N, DCI); +} + static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) return true; @@ -5688,7 +6725,7 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { } static bool isCanonicalized(SelectionDAG &DAG, SDValue Op, - const SISubtarget *ST, unsigned MaxDepth=5) { + const GCNSubtarget *ST, unsigned MaxDepth=5) { // If source is a result of another standard FP operation it is already in // canonical form. @@ -5946,7 +6983,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY && - VT != MVT::f64 && + !VT.isVector() && VT != MVT::f64 && ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) { // max(max(a, b), c) -> max3(a, b, c) // min(min(a, b), c) -> min3(a, b, c) @@ -6066,15 +7103,87 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, SDValue SITargetLowering::performExtractVectorEltCombine( SDNode *N, DAGCombinerInfo &DCI) const { SDValue Vec = N->getOperand(0); - SelectionDAG &DAG = DCI.DAG; - if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) { + + EVT VecVT = Vec.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + + if ((Vec.getOpcode() == ISD::FNEG || + Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) { SDLoc SL(N); EVT EltVT = N->getValueType(0); SDValue Idx = N->getOperand(1); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec.getOperand(0), Idx); - return DAG.getNode(ISD::FNEG, SL, EltVT, Elt); + return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt); + } + + // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx) + // => + // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx) + // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx) + // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt + if (Vec.hasOneUse() && DCI.isBeforeLegalize()) { + SDLoc SL(N); + EVT EltVT = N->getValueType(0); + SDValue Idx = N->getOperand(1); + unsigned Opc = Vec.getOpcode(); + + switch(Opc) { + default: + return SDValue(); + // TODO: Support other binary operations. 
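// ---------------------------------------------------------------------------
// Illustrative example (not part of the patch): when a vector binary operator
// has a single-lane extract as its only use, the rewrite below turns, e.g.,
//   %s = add <2 x i32> %a, %b
//   %e = extractelement <2 x i32> %s, i32 1
// into the scalar form
//   %a1 = extractelement <2 x i32> %a, i32 1
//   %b1 = extractelement <2 x i32> %b, i32 1
//   %e  = add i32 %a1, %b1
// so the vector operation itself disappears.
// ---------------------------------------------------------------------------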
+ case ISD::FADD: + case ISD::ADD: + case ISD::UMIN: + case ISD::UMAX: + case ISD::SMIN: + case ISD::SMAX: + case ISD::FMAXNUM: + case ISD::FMINNUM: + return DAG.getNode(Opc, SL, EltVT, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec.getOperand(0), Idx), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec.getOperand(1), Idx)); + } + } + + if (!DCI.isBeforeLegalize()) + return SDValue(); + + unsigned VecSize = VecVT.getSizeInBits(); + unsigned EltSize = EltVT.getSizeInBits(); + + // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit + // elements. This exposes more load reduction opportunities by replacing + // multiple small extract_vector_elements with a single 32-bit extract. + auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (EltSize <= 16 && + EltVT.isByteSized() && + VecSize > 32 && + VecSize % 32 == 0 && + Idx) { + EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT); + + unsigned BitIndex = Idx->getZExtValue() * EltSize; + unsigned EltIdx = BitIndex / 32; + unsigned LeftoverBitIdx = BitIndex % 32; + SDLoc SL(N); + + SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec); + DCI.AddToWorklist(Cast.getNode()); + + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast, + DAG.getConstant(EltIdx, SL, MVT::i32)); + DCI.AddToWorklist(Elt.getNode()); + SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt, + DAG.getConstant(LeftoverBitIdx, SL, MVT::i32)); + DCI.AddToWorklist(Srl.getNode()); + + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl); + DCI.AddToWorklist(Trunc.getNode()); + return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc); } return SDValue(); @@ -6135,8 +7244,8 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const TargetOptions &Options = DAG.getTarget().Options; if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || - (N0->getFlags().hasUnsafeAlgebra() && - N1->getFlags().hasUnsafeAlgebra())) && + (N0->getFlags().hasAllowContract() && + N1->getFlags().hasAllowContract())) && isFMAFasterThanFMulAndFAdd(VT)) { return ISD::FMA; } @@ -6192,7 +7301,7 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return SDValue(); } - if (VT != MVT::i32) + if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) return SDValue(); // add x, zext (setcc) => addcarry x, 0, setcc @@ -6368,6 +7477,79 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performFMACombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + SDLoc SL(N); + + if (!Subtarget->hasDLInsts() || VT != MVT::f32) + return SDValue(); + + // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) -> + // FDOT2((V2F16)S0, (V2F16)S1, (F32)z)) + SDValue Op1 = N->getOperand(0); + SDValue Op2 = N->getOperand(1); + SDValue FMA = N->getOperand(2); + + if (FMA.getOpcode() != ISD::FMA || + Op1.getOpcode() != ISD::FP_EXTEND || + Op2.getOpcode() != ISD::FP_EXTEND) + return SDValue(); + + // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, + // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract + // is sufficient to allow generaing fdot2. 
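// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the pattern matched below is a
// two-element half-precision dot product accumulated into f32, which is what
// the fdot2 node models. A scalar reference with the packed v2f16 operands
// already converted to float pairs (the f16->f32 step is left out to keep the
// model in plain C++; helper name made up):
static float fdot2Model(float A0, float A1, float B0, float B1, float Acc) {
  return A0 * B0 + A1 * B1 + Acc; // fma(A0, B0, fma(A1, B1, Acc))
}
// ---------------------------------------------------------------------------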
+ const TargetOptions &Options = DAG.getTarget().Options; + if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || + (N->getFlags().hasAllowContract() && + FMA->getFlags().hasAllowContract())) { + Op1 = Op1.getOperand(0); + Op2 = Op2.getOperand(0); + if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + SDValue Vec1 = Op1.getOperand(0); + SDValue Idx1 = Op1.getOperand(1); + SDValue Vec2 = Op2.getOperand(0); + + SDValue FMAOp1 = FMA.getOperand(0); + SDValue FMAOp2 = FMA.getOperand(1); + SDValue FMAAcc = FMA.getOperand(2); + + if (FMAOp1.getOpcode() != ISD::FP_EXTEND || + FMAOp2.getOpcode() != ISD::FP_EXTEND) + return SDValue(); + + FMAOp1 = FMAOp1.getOperand(0); + FMAOp2 = FMAOp2.getOperand(0); + if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + SDValue Vec3 = FMAOp1.getOperand(0); + SDValue Vec4 = FMAOp2.getOperand(0); + SDValue Idx2 = FMAOp1.getOperand(1); + + if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) || + // Idx1 and Idx2 cannot be the same. + Idx1 == Idx2) + return SDValue(); + + if (Vec1 == Vec2 || Vec3 == Vec4) + return SDValue(); + + if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16) + return SDValue(); + + if ((Vec1 == Vec3 && Vec2 == Vec4) || + (Vec1 == Vec4 && Vec2 == Vec3)) + return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc); + } + return SDValue(); +} + SDValue SITargetLowering::performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -6387,23 +7569,49 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, } } - if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND && - isBoolSGPR(LHS.getOperand(0))) { - // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 - // setcc (sext from i1 cc), -1, eq|sle|uge) => cc - // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 - // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc - if ((CRHS->isAllOnesValue() && - (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) || - (CRHS->isNullValue() && - (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) - return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), - DAG.getConstant(-1, SL, MVT::i1)); - if ((CRHS->isAllOnesValue() && - (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || - (CRHS->isNullValue() && - (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT))) - return LHS.getOperand(0); + if (CRHS) { + if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND && + isBoolSGPR(LHS.getOperand(0))) { + // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 + // setcc (sext from i1 cc), -1, eq|sle|uge) => cc + // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 + // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc + if ((CRHS->isAllOnesValue() && + (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) || + (CRHS->isNullValue() && + (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE))) + return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), + DAG.getConstant(-1, SL, MVT::i1)); + if ((CRHS->isAllOnesValue() && + (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) || + (CRHS->isNullValue() && + (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT))) + return LHS.getOperand(0); + } + + uint64_t CRHSVal = CRHS->getZExtValue(); + if ((CC == ISD::SETEQ || CC == ISD::SETNE) && + LHS.getOpcode() == 
ISD::SELECT && + isa<ConstantSDNode>(LHS.getOperand(1)) && + isa<ConstantSDNode>(LHS.getOperand(2)) && + LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) && + isBoolSGPR(LHS.getOperand(0))) { + // Given CT != FT: + // setcc (select cc, CT, CF), CF, eq => xor cc, -1 + // setcc (select cc, CT, CF), CF, ne => cc + // setcc (select cc, CT, CF), CT, ne => xor cc, -1 + // setcc (select cc, CT, CF), CT, eq => cc + uint64_t CT = LHS.getConstantOperandVal(1); + uint64_t CF = LHS.getConstantOperandVal(2); + + if ((CF == CRHSVal && CC == ISD::SETEQ) || + (CT == CRHSVal && CC == ISD::SETNE)) + return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0), + DAG.getConstant(-1, SL, MVT::i1)); + if ((CF == CRHSVal && CC == ISD::SETNE) || + (CT == CRHSVal && CC == ISD::SETEQ)) + return LHS.getOperand(0); + } } if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() && @@ -6472,6 +7680,29 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performClampCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); + if (!CSrc) + return SDValue(); + + const APFloat &F = CSrc->getValueAPF(); + APFloat Zero = APFloat::getZero(F.getSemantics()); + APFloat::cmpResult Cmp0 = F.compare(Zero); + if (Cmp0 == APFloat::cmpLessThan || + (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) { + return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); + } + + APFloat One(F.getSemantics(), "1.0"); + APFloat::cmpResult Cmp1 = F.compare(One); + if (Cmp1 == APFloat::cmpGreaterThan) + return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); + + return SDValue(CSrc, 0); +} + + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { @@ -6503,7 +7734,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performMinMaxCombine(N, DCI); break; } - case ISD::LOAD: + case ISD::FMA: + return performFMACombine(N, DCI); + case ISD::LOAD: { + if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI)) + return Widended; + LLVM_FALLTHROUGH; + } case ISD::STORE: case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: @@ -6521,7 +7758,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case AMDGPUISD::ATOMIC_INC: - case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics. + case AMDGPUISD::ATOMIC_DEC: + case AMDGPUISD::ATOMIC_LOAD_FADD: + case AMDGPUISD::ATOMIC_LOAD_FMIN: + case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics. 
if (DCI.isBeforeLegalize()) break; return performMemSDNodeCombine(cast<MemSDNode>(N), DCI); @@ -6537,11 +7777,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performClassCombine(N, DCI); case ISD::FCANONICALIZE: return performFCanonicalizeCombine(N, DCI); - case AMDGPUISD::FRACT: case AMDGPUISD::RCP: + return performRcpCombine(N, DCI); + case AMDGPUISD::FRACT: case AMDGPUISD::RSQ: case AMDGPUISD::RCP_LEGACY: case AMDGPUISD::RSQ_LEGACY: + case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::RSQ_CLAMP: case AMDGPUISD::LDEXP: { SDValue Src = N->getOperand(0); @@ -6561,6 +7803,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performFMed3Combine(N, DCI); case AMDGPUISD::CVT_PKRTZ_F16_F32: return performCvtPkRTZCombine(N, DCI); + case AMDGPUISD::CLAMP: + return performClampCombine(N, DCI); case ISD::SCALAR_TO_VECTOR: { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -6587,7 +7831,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } -/// \brief Helper function for adjustWritemask +/// Helper function for adjustWritemask static unsigned SubIdx2Lane(unsigned Idx) { switch (Idx) { default: return 0; @@ -6598,12 +7842,19 @@ static unsigned SubIdx2Lane(unsigned Idx) { } } -/// \brief Adjust the writemask of MIMG instructions +/// Adjust the writemask of MIMG instructions SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, SelectionDAG &DAG) const { + unsigned Opcode = Node->getMachineOpcode(); + + // Subtract 1 because the vdata output is not a MachineSDNode operand. + int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1; + if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx)) + return Node; // not implemented for D16 + SDNode *Users[4] = { nullptr }; unsigned Lane = 0; - unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3; + unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1; unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); unsigned NewDmask = 0; bool HasChain = Node->getNumValues() > 1; @@ -6653,9 +7904,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, unsigned BitsSet = countPopulation(NewDmask); - const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - int NewOpcode = AMDGPU::getMaskedMIMGOp(*TII, - Node->getMachineOpcode(), BitsSet); + int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet); assert(NewOpcode != -1 && NewOpcode != static_cast<int>(Node->getMachineOpcode()) && "failed to find equivalent MIMG op"); @@ -6720,7 +7969,7 @@ static bool isFrameIndexOp(SDValue Op) { return isa<FrameIndexSDNode>(Op); } -/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG) +/// Legalize target independent instructions (e.g. INSERT_SUBREG) /// with frame index operands. /// LLVM assumes that inputs are to these instructions are registers. SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, @@ -6767,7 +8016,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, return DAG.UpdateNodeOperands(Node, Ops); } -/// \brief Fold the instructions after selecting them. +/// Fold the instructions after selecting them. /// Returns null if users were already updated. 
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { @@ -6841,7 +8090,7 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, return Node; } -/// \brief Assign the register class depending on the number of +/// Assign the register class depending on the number of /// bits set in the writemask void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { @@ -6928,7 +8177,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); } -/// \brief Return a resource descriptor with the 'Add TID' bit enabled +/// Return a resource descriptor with the 'Add TID' bit enabled /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] /// of the resource descriptor) to create an offset, which is added to /// the resource pointer. @@ -6970,11 +8219,11 @@ std::pair<unsigned, const TargetRegisterClass *> SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { - if (!isTypeLegal(VT)) - return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); - + const TargetRegisterClass *RC = nullptr; if (Constraint.size() == 1) { switch (Constraint[0]) { + default: + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); case 's': case 'r': switch (VT.getSizeInBits()) { @@ -6982,40 +8231,56 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, nullptr); case 32: case 16: - return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass); + RC = &AMDGPU::SReg_32_XM0RegClass; + break; case 64: - return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + RC = &AMDGPU::SGPR_64RegClass; + break; case 128: - return std::make_pair(0U, &AMDGPU::SReg_128RegClass); + RC = &AMDGPU::SReg_128RegClass; + break; case 256: - return std::make_pair(0U, &AMDGPU::SReg_256RegClass); + RC = &AMDGPU::SReg_256RegClass; + break; case 512: - return std::make_pair(0U, &AMDGPU::SReg_512RegClass); + RC = &AMDGPU::SReg_512RegClass; + break; } - + break; case 'v': switch (VT.getSizeInBits()) { default: return std::make_pair(0U, nullptr); case 32: case 16: - return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); + RC = &AMDGPU::VGPR_32RegClass; + break; case 64: - return std::make_pair(0U, &AMDGPU::VReg_64RegClass); + RC = &AMDGPU::VReg_64RegClass; + break; case 96: - return std::make_pair(0U, &AMDGPU::VReg_96RegClass); + RC = &AMDGPU::VReg_96RegClass; + break; case 128: - return std::make_pair(0U, &AMDGPU::VReg_128RegClass); + RC = &AMDGPU::VReg_128RegClass; + break; case 256: - return std::make_pair(0U, &AMDGPU::VReg_256RegClass); + RC = &AMDGPU::VReg_256RegClass; + break; case 512: - return std::make_pair(0U, &AMDGPU::VReg_512RegClass); + RC = &AMDGPU::VReg_512RegClass; + break; } + break; } + // We actually support i128, i16 and f16 as inline parameters + // even if they are not reported as legal + if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 || + VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16)) + return std::make_pair(0U, RC); } if (Constraint.size() > 1) { - const TargetRegisterClass *RC = nullptr; if (Constraint[1] == 'v') { RC = &AMDGPU::VGPR_32RegClass; } else if (Constraint[1] == 's') { @@ -7052,8 +8317,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); const MachineFrameInfo &MFI = 
MF.getFrameInfo(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); if (Info->isEntryFunction()) { // Callable functions have fixed registers used for stack access. @@ -7083,6 +8347,8 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, Info->getScratchWaveOffsetReg()); + Info->limitOccupancy(MF); + TargetLoweringBase::finalizeLowering(MF); } @@ -7103,3 +8369,69 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, // calculation won't overflow, so assume the sign bit is never set. Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits); } + +bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, + FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const +{ + switch (N->getOpcode()) { + case ISD::Register: + case ISD::CopyFromReg: + { + const RegisterSDNode *R = nullptr; + if (N->getOpcode() == ISD::Register) { + R = dyn_cast<RegisterSDNode>(N); + } + else { + R = dyn_cast<RegisterSDNode>(N->getOperand(1)); + } + if (R) + { + const MachineFunction * MF = FLI->MF; + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); + unsigned Reg = R->getReg(); + if (TRI.isPhysicalRegister(Reg)) + return TRI.isVGPR(MRI, Reg); + + if (MRI.isLiveIn(Reg)) { + // workitem.id.x workitem.id.y workitem.id.z + // Any VGPR formal argument is also considered divergent + if (TRI.isVGPR(MRI, Reg)) + return true; + // Formal arguments of non-entry functions + // are conservatively considered divergent + else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) + return true; + } + return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg)); + } + } + break; + case ISD::LOAD: { + const LoadSDNode *L = dyn_cast<LoadSDNode>(N); + if (L->getMemOperand()->getAddrSpace() == + Subtarget->getAMDGPUAS().PRIVATE_ADDRESS) + return true; + } break; + case ISD::CALLSEQ_END: + return true; + break; + case ISD::INTRINSIC_WO_CHAIN: + { + + } + return AMDGPU::isIntrinsicSourceOfDivergence( + cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()); + case ISD::INTRINSIC_W_CHAIN: + return AMDGPU::isIntrinsicSourceOfDivergence( + cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()); + // In some cases intrinsics that are a source of divergence have been + // lowered to AMDGPUISD so we also need to check those too. 
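// ---------------------------------------------------------------------------
// Illustrative example (not part of the patch): "divergent" here means the
// value may differ between lanes of a wave. For instance
//   %tid = call i32 @llvm.amdgcn.workitem.id.x()  ; divergent, per-lane value
//   %gid = call i32 @llvm.amdgcn.workgroup.id.x() ; uniform, same for the wave
// and, as handled above, any load from the private (scratch) address space is
// treated as divergent because every lane addresses its own stack.
// ---------------------------------------------------------------------------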
+ case AMDGPUISD::INTERP_MOV: + case AMDGPUISD::INTERP_P1: + case AMDGPUISD::INTERP_P2: + return true; + } + return false; +} diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index b48e67f7563a..ad049f2a71c3 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief SI DAG Lowering interface definition +/// SI DAG Lowering interface definition // //===----------------------------------------------------------------------===// @@ -22,12 +22,15 @@ namespace llvm { class SITargetLowering final : public AMDGPUTargetLowering { +private: + const GCNSubtarget *Subtarget; + SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, uint64_t Offset) const; SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const; SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, - uint64_t Offset, bool Signed, + uint64_t Offset, unsigned Align, bool Signed, const ISD::InputArg *Arg = nullptr) const; SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, @@ -42,10 +45,14 @@ class SITargetLowering final : public AMDGPUTargetLowering { SelectionDAG &DAG) const override; SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, MVT VT, unsigned Offset) const; + SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, + SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; + + SDValue widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const; @@ -60,7 +67,13 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - /// \brief Converts \p Op, which must be of floating point type, to the + SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M, + SelectionDAG &DAG, + bool IsIntrinsic = false) const; + + SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const; + + /// Converts \p Op, which must be of floating point type, to the /// floating point type \p VT, by either extending or truncating it. SDValue getFPExtOrFPTrunc(SelectionDAG &DAG, SDValue Op, @@ -71,7 +84,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val, bool Signed, const ISD::InputArg *Arg = nullptr) const; - /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16. + /// Custom lowering for ISD::FP_ROUND for MVT::f16. 
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue getSegmentAperture(unsigned AS, const SDLoc &DL, @@ -80,7 +93,9 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const; SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const; SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; @@ -121,8 +136,11 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; bool isLegalGlobalAddressingMode(const AddrMode &AM) const; @@ -145,9 +163,11 @@ class SITargetLowering final : public AMDGPUTargetLowering { bool shouldEmitPCReloc(const GlobalValue *GV) const; public: - SITargetLowering(const TargetMachine &tm, const SISubtarget &STI); + SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI); - const SISubtarget *getSubtarget() const; + const GCNSubtarget *getSubtarget() const; + + bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const override; bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override; @@ -255,7 +275,10 @@ public: EVT VT) const override; MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override; bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const; + SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; @@ -284,6 +307,9 @@ public: const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const override; + + bool isSDNodeSourceOfDivergence(const SDNode *N, + FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp index a2f844d7854e..61c8f359e168 100644 --- a/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This pass inserts branches on the 0 exec mask over divergent branches +/// This pass inserts branches on the 0 exec mask over divergent branches /// branches when it's expected that jumping over the untaken control flow will /// be cheaper than having every workitem no-op through it. 
// @@ -18,6 +18,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -210,65 +211,73 @@ void SIInsertSkips::kill(MachineInstr &MI) { switch (MI.getOperand(2).getImm()) { case ISD::SETOEQ: case ISD::SETEQ: - Opcode = AMDGPU::V_CMPX_EQ_F32_e32; + Opcode = AMDGPU::V_CMPX_EQ_F32_e64; break; case ISD::SETOGT: case ISD::SETGT: - Opcode = AMDGPU::V_CMPX_LT_F32_e32; + Opcode = AMDGPU::V_CMPX_LT_F32_e64; break; case ISD::SETOGE: case ISD::SETGE: - Opcode = AMDGPU::V_CMPX_LE_F32_e32; + Opcode = AMDGPU::V_CMPX_LE_F32_e64; break; case ISD::SETOLT: case ISD::SETLT: - Opcode = AMDGPU::V_CMPX_GT_F32_e32; + Opcode = AMDGPU::V_CMPX_GT_F32_e64; break; case ISD::SETOLE: case ISD::SETLE: - Opcode = AMDGPU::V_CMPX_GE_F32_e32; + Opcode = AMDGPU::V_CMPX_GE_F32_e64; break; case ISD::SETONE: case ISD::SETNE: - Opcode = AMDGPU::V_CMPX_LG_F32_e32; + Opcode = AMDGPU::V_CMPX_LG_F32_e64; break; case ISD::SETO: - Opcode = AMDGPU::V_CMPX_O_F32_e32; + Opcode = AMDGPU::V_CMPX_O_F32_e64; break; case ISD::SETUO: - Opcode = AMDGPU::V_CMPX_U_F32_e32; + Opcode = AMDGPU::V_CMPX_U_F32_e64; break; case ISD::SETUEQ: - Opcode = AMDGPU::V_CMPX_NLG_F32_e32; + Opcode = AMDGPU::V_CMPX_NLG_F32_e64; break; case ISD::SETUGT: - Opcode = AMDGPU::V_CMPX_NGE_F32_e32; + Opcode = AMDGPU::V_CMPX_NGE_F32_e64; break; case ISD::SETUGE: - Opcode = AMDGPU::V_CMPX_NGT_F32_e32; + Opcode = AMDGPU::V_CMPX_NGT_F32_e64; break; case ISD::SETULT: - Opcode = AMDGPU::V_CMPX_NLE_F32_e32; + Opcode = AMDGPU::V_CMPX_NLE_F32_e64; break; case ISD::SETULE: - Opcode = AMDGPU::V_CMPX_NLT_F32_e32; + Opcode = AMDGPU::V_CMPX_NLT_F32_e64; break; case ISD::SETUNE: - Opcode = AMDGPU::V_CMPX_NEQ_F32_e32; + Opcode = AMDGPU::V_CMPX_NEQ_F32_e64; break; default: llvm_unreachable("invalid ISD:SET cond code"); } - // TODO: Allow this: - if (!MI.getOperand(0).isReg() || - !TRI->isVGPR(MBB.getParent()->getRegInfo(), - MI.getOperand(0).getReg())) - llvm_unreachable("SI_KILL operand should be a VGPR"); - - BuildMI(MBB, &MI, DL, TII->get(Opcode)) - .add(MI.getOperand(1)) - .add(MI.getOperand(0)); + assert(MI.getOperand(0).isReg()); + + if (TRI->isVGPR(MBB.getParent()->getRegInfo(), + MI.getOperand(0).getReg())) { + Opcode = AMDGPU::getVOPe32(Opcode); + BuildMI(MBB, &MI, DL, TII->get(Opcode)) + .add(MI.getOperand(1)) + .add(MI.getOperand(0)); + } else { + BuildMI(MBB, &MI, DL, TII->get(Opcode)) + .addReg(AMDGPU::VCC, RegState::Define) + .addImm(0) // src0 modifiers + .add(MI.getOperand(1)) + .addImm(0) // src1 modifiers + .add(MI.getOperand(0)) + .addImm(0); // omod + } break; } case AMDGPU::SI_KILL_I1_TERMINATOR: { @@ -330,7 +339,7 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, } bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); SkipThreshold = SkipThresholdFlag; diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 6bbe5979316d..d456e3d9b94d 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Insert wait instructions for memory reads and writes. 
+/// Insert wait instructions for memory reads and writes. /// /// Memory reads and writes are issued asynchronously, so we need to insert /// S_WAITCNT instructions when we want to access any of their results or @@ -40,6 +40,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/DebugCounter.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -50,9 +51,21 @@ #include <utility> #include <vector> +using namespace llvm; + #define DEBUG_TYPE "si-insert-waitcnts" -using namespace llvm; +DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp", + "Force emit s_waitcnt expcnt(0) instrs"); +DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm", + "Force emit s_waitcnt lgkmcnt(0) instrs"); +DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm", + "Force emit s_waitcnt vmcnt(0) instrs"); + +static cl::opt<unsigned> ForceEmitZeroFlag( + "amdgpu-waitcnt-forcezero", + cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), + cl::init(0), cl::Hidden); namespace { @@ -115,15 +128,15 @@ enum RegisterMapping { (w) = (enum WaitEventType)((w) + 1)) // This is a per-basic-block object that maintains current score brackets -// of each wait-counter, and a per-register scoreboard for each wait-couner. +// of each wait counter, and a per-register scoreboard for each wait counter. // We also maintain the latest score for every event type that can change the // waitcnt in order to know if there are multiple types of events within // the brackets. When multiple types of event happen in the bracket, -// wait-count may get decreased out of order, therefore we need to put in +// wait count may get decreased out of order, therefore we need to put in // "s_waitcnt 0" before use. 
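For context, a minimal standalone sketch of the score-bracket rule the comment above describes: every memory event bumps an upper-bound score, and a use whose score still falls inside the open-ended interval (LB, UB] needs an s_waitcnt before it. The ScoreBracket type and its methods are hypothetical simplifications, not the real BlockWaitcntBrackets interface; the real pass additionally keeps per-register scoreboards for each of the VM/EXP/LGKM counters and is less conservative about how far it advances the lower bound.

#include <cassert>

// Simplified model of one wait-counter bracket: scores in (LB, UB] are
// still outstanding; anything <= LB has already been waited on.
struct ScoreBracket {
  int LB = 0; // everything up to and including LB is known complete
  int UB = 0; // highest score handed out so far

  // Each new memory event gets the next score.
  int pushEvent() { return ++UB; }

  // Returns true (and closes the bracket) if consuming a value tagged with
  // ScoreToWait requires an s_waitcnt first. This sketch always waits on
  // everything outstanding, which the real pass only does in some cases.
  bool updateByWait(int ScoreToWait) {
    if (UB >= ScoreToWait && ScoreToWait > LB) {
      LB = UB;
      return true;
    }
    return false;
  }
};

int main() {
  ScoreBracket VmCnt;
  int Load = VmCnt.pushEvent();      // e.g. a buffer load was issued
  assert(VmCnt.updateByWait(Load));  // first use of the result: wait needed
  assert(!VmCnt.updateByWait(Load)); // already waited on: no further wait
  return 0;
}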
class BlockWaitcntBrackets { public: - BlockWaitcntBrackets() { + BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) { for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { memset(VgprScores[T], 0, sizeof(VgprScores[T])); @@ -301,6 +314,7 @@ public: void dump() { print(dbgs()); } private: + const GCNSubtarget *ST = nullptr; bool WaitAtBeginning = false; bool RevisitLoop = false; bool MixedExpTypes = false; @@ -332,14 +346,12 @@ public: void incIterCnt() { IterCnt++; } void resetIterCnt() { IterCnt = 0; } - int32_t getIterCnt() { return IterCnt; } + unsigned getIterCnt() { return IterCnt; } void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; } MachineInstr *getWaitcnt() const { return LfWaitcnt; } - void print() { - DEBUG(dbgs() << " iteration " << IterCnt << '\n';); - } + void print() { LLVM_DEBUG(dbgs() << " iteration " << IterCnt << '\n';); } private: // s_waitcnt added at the end of loop footer to stablize wait scores @@ -352,7 +364,7 @@ private: class SIInsertWaitcnts : public MachineFunctionPass { private: - const SISubtarget *ST = nullptr; + const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; @@ -361,22 +373,31 @@ private: AMDGPUAS AMDGPUASI; DenseSet<MachineBasicBlock *> BlockVisitedSet; - DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet; + DenseSet<MachineInstr *> TrackedWaitcntSet; DenseSet<MachineInstr *> VCCZBugHandledSet; DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>> BlockWaitcntBracketsMap; - DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet; + std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet; DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap; std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets; + // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0 + // because of amdgpu-waitcnt-forcezero flag + bool ForceEmitZeroWaitcnts; + bool ForceEmitWaitcnt[NUM_INST_CNTS]; + public: static char ID; - SIInsertWaitcnts() : MachineFunctionPass(ID) {} + SIInsertWaitcnts() : MachineFunctionPass(ID) { + (void)ForceExpCounter; + (void)ForceLgkmCounter; + (void)ForceVMCounter; + } bool runOnMachineFunction(MachineFunction &MF) override; @@ -397,15 +418,53 @@ public: llvm::make_unique<BlockWaitcntBrackets>(*Bracket)); } + bool isForceEmitWaitcnt() const { + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) + if (ForceEmitWaitcnt[T]) + return true; + return false; + } + + void setForceEmitWaitcnt() { +// For non-debug builds, ForceEmitWaitcnt has been initialized to false; +// For debug builds, get the debug counter info and adjust if need be +#ifndef NDEBUG + if (DebugCounter::isCounterSet(ForceExpCounter) && + DebugCounter::shouldExecute(ForceExpCounter)) { + ForceEmitWaitcnt[EXP_CNT] = true; + } else { + ForceEmitWaitcnt[EXP_CNT] = false; + } + + if (DebugCounter::isCounterSet(ForceLgkmCounter) && + DebugCounter::shouldExecute(ForceLgkmCounter)) { + ForceEmitWaitcnt[LGKM_CNT] = true; + } else { + ForceEmitWaitcnt[LGKM_CNT] = false; + } + + if (DebugCounter::isCounterSet(ForceVMCounter) && + DebugCounter::shouldExecute(ForceVMCounter)) { + ForceEmitWaitcnt[VM_CNT] = true; + } else { + ForceEmitWaitcnt[VM_CNT] = false; + } +#endif // NDEBUG + } + bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; - MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI, - BlockWaitcntBrackets 
*ScoreBrackets); - void updateEventWaitCntAfter(MachineInstr &Inst, + void generateWaitcntInstBefore(MachineInstr &MI, + BlockWaitcntBrackets *ScoreBrackets); + void updateEventWaitcntAfter(MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets); void mergeInputScoreBrackets(MachineBasicBlock &Block); - MachineBasicBlock *loopBottom(const MachineLoop *Loop); + bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block); + unsigned countNumBottomBlocks(const MachineLoop *Loop); void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block); void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst); + bool isWaitcntStronger(unsigned LHS, unsigned RHS); + unsigned combineWaitcnt(unsigned LHS, unsigned RHS); }; } // end anonymous namespace @@ -459,7 +518,7 @@ void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI, const MachineRegisterInfo *MRI, unsigned OpNo, int32_t Val) { RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false); - DEBUG({ + LLVM_DEBUG({ const MachineOperand &Opnd = MI->getOperand(OpNo); assert(TRI->isVGPR(*MRI, Opnd.getReg())); }); @@ -681,14 +740,17 @@ unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T, const int32_t LB = getScoreLB(T); const int32_t UB = getScoreUB(T); if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { - if (T == VM_CNT && hasPendingFlat()) { - // If there is a pending FLAT operation, and this is a VM waitcnt, - // then we need to force a waitcnt 0 for VM. + if ((T == VM_CNT || T == LGKM_CNT) && + hasPendingFlat() && + !ST->hasFlatLgkmVMemCountInOrder()) { + // If there is a pending FLAT operation, and this is a VMem or LGKM + // waitcnt and the target can report early completion, then we need + // to force a waitcnt 0. NeedWait = CNT_MASK(T); setScoreLB(T, getScoreUB(T)); } else if (counterOutOfOrder(T)) { // Counter can get decremented out-of-order when there - // are multiple types event in the brack. Also emit an s_wait counter + // are multiple types event in the bracket. Also emit an s_wait counter // with a conservative value of 0 for the counter. NeedWait = CNT_MASK(T); setScoreLB(T, getScoreUB(T)); @@ -789,7 +851,30 @@ static bool readsVCCZ(const MachineInstr &MI) { !MI.getOperand(1).isUndef(); } -/// \brief Generate s_waitcnt instruction to be placed before cur_Inst. +/// Given wait count encodings checks if LHS is stronger than RHS. +bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) { + if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS)) + return false; + if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS)) + return false; + if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS)) + return false; + return true; +} + +/// Given wait count encodings create a new encoding which is stronger +/// or equal to both. +unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) { + unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS), + AMDGPU::decodeVmcnt(IV, RHS)); + unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS), + AMDGPU::decodeLgkmcnt(IV, RHS)); + unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS), + AMDGPU::decodeExpcnt(IV, RHS)); + return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt); +} + +/// Generate s_waitcnt instruction to be placed before cur_Inst. /// Instructions of a given type are returned in order, /// but instructions of different types can complete out of order. 
/// We rely on this in-order completion @@ -799,23 +884,29 @@ static bool readsVCCZ(const MachineInstr &MI) { /// and if so what the value of each counter is. /// The "score bracket" is bound by the lower bound and upper bound /// scores (*_score_LB and *_score_ub respectively). -MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( +void SIInsertWaitcnts::generateWaitcntInstBefore( MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) { // To emit, or not to emit - that's the question! // Start with an assumption that there is no need to emit. - unsigned int EmitSwaitcnt = 0; - // s_waitcnt instruction to return; default is NULL. - MachineInstr *SWaitInst = nullptr; + unsigned int EmitWaitcnt = 0; + // No need to wait before phi. If a phi-move exists, then the wait should // has been inserted before the move. If a phi-move does not exist, then // wait should be inserted before the real use. The same is true for // sc-merge. It is not a coincident that all these cases correspond to the // instructions that are skipped in the assembling loop. bool NeedLineMapping = false; // TODO: Check on this. - if (MI.isDebugValue() && + + // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug + bool ForceEmitZeroWaitcnt = false; + + setForceEmitWaitcnt(); + bool IsForceEmitWaitcnt = isForceEmitWaitcnt(); + + if (MI.isDebugInstr() && // TODO: any other opcode? !NeedLineMapping) { - return SWaitInst; + return; } // See if an s_waitcnt is forced at block entry, or is needed at @@ -826,7 +917,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( ScoreBrackets->clearWaitAtBeginning(); for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { - EmitSwaitcnt |= CNT_MASK(T); + EmitWaitcnt |= CNT_MASK(T); ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); } } @@ -836,21 +927,20 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) { - EmitSwaitcnt |= + EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); } // All waits must be resolved at call return. // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. - if (MI.getOpcode() == AMDGPU::RETURN || - MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || + if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || MI.getOpcode() == AMDGPU::S_SETPC_B64_return) { for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); - EmitSwaitcnt |= CNT_MASK(T); + EmitWaitcnt |= CNT_MASK(T); } } } @@ -861,7 +951,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( AMDGPU::SendMsg::ID_GS_DONE)) { if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) { ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); - EmitSwaitcnt |= CNT_MASK(VM_CNT); + EmitWaitcnt |= CNT_MASK(VM_CNT); } } #if 0 // TODO: the following blocks of logic when we have fence. 
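The isWaitcntStronger/combineWaitcnt helpers added above compare and merge S_WAITCNT immediates counter by counter: a smaller count means a stricter wait, so one encoding is "stronger" when none of its counts exceed the other's, and the combination takes the per-counter minimum. A minimal standalone sketch of that rule, using a made-up three-field encoding in place of the real AMDGPU::encodeWaitcnt/decodeVmcnt/decodeExpcnt/decodeLgkmcnt helpers (field widths are illustrative only):

#include <algorithm>
#include <cassert>

// Toy encoding: vmcnt in bits [3:0], expcnt in [6:4], lgkmcnt in [10:7].
// The real layout comes from the ISA-specific AMDGPU helpers.
struct Waitcnt { unsigned Vm, Exp, Lgkm; };

static Waitcnt decode(unsigned Enc) {
  return {Enc & 0xf, (Enc >> 4) & 0x7, (Enc >> 7) & 0xf};
}

static unsigned encode(Waitcnt W) {
  return (W.Vm & 0xf) | ((W.Exp & 0x7) << 4) | ((W.Lgkm & 0xf) << 7);
}

// LHS is stronger than (or equal to) RHS if it waits at least as hard on
// every counter, i.e. none of its counts exceed the corresponding RHS count.
static bool isStronger(unsigned LHS, unsigned RHS) {
  Waitcnt L = decode(LHS), R = decode(RHS);
  return L.Vm <= R.Vm && L.Exp <= R.Exp && L.Lgkm <= R.Lgkm;
}

// Produce an encoding at least as strong as both inputs by taking the
// per-counter minimum.
static unsigned combine(unsigned LHS, unsigned RHS) {
  Waitcnt L = decode(LHS), R = decode(RHS);
  return encode({std::min(L.Vm, R.Vm), std::min(L.Exp, R.Exp),
                 std::min(L.Lgkm, R.Lgkm)});
}

int main() {
  unsigned A = encode({0, 7, 15}); // vmcnt(0), other counters unconstrained
  unsigned B = encode({3, 0, 15}); // expcnt(0)
  unsigned C = combine(A, B);      // waits on vmcnt(0) and expcnt(0)
  assert(isStronger(C, A) && isStronger(C, B));
  assert(!isStronger(A, B));       // neither input subsumes the other
  return 0;
}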
@@ -879,11 +969,11 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( case SCMEM_LDS: if (group_is_multi_wave || context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) { - EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT)); // LDS may have to wait for VM_CNT after buffer load to LDS if (target_info->HasBufferLoadToLDS()) { - EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); } } @@ -891,9 +981,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( case SCMEM_GDS: if (group_is_multi_wave || fence_is_global) { - EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT)); } break; @@ -903,9 +993,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( case SCMEM_RING: case SCMEM_SCATTER: if (group_is_multi_wave || fence_is_global) { - EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); } break; @@ -926,13 +1016,13 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { // Export and GDS are tracked individually, either may trigger a waitcnt // for EXEC. - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK)); } @@ -947,7 +1037,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( if (ScoreBrackets->getScoreUB(EXP_CNT) > ScoreBrackets->getScoreLB(EXP_CNT)) { ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitSwaitcnt |= CNT_MASK(EXP_CNT); + EmitWaitcnt |= CNT_MASK(EXP_CNT); } } #endif @@ -965,7 +1055,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; // VM_CNT is only relevant to vgpr or LDS. - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); } @@ -977,10 +1067,10 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (TRI->isVGPR(MRIA, Op.getReg())) { // VM_CNT is only relevant to vgpr or LDS. 
- EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); } - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); } } @@ -999,9 +1089,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( if (AS != AMDGPUASI.LOCAL_ADDRESS) continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT)); } } @@ -1012,38 +1102,35 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true); for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (TRI->isVGPR(MRIA, Def.getReg())) { - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT)); } - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); } } // End of for loop that looks at all dest operands. } - // TODO: Tie force zero to a compiler triage option. - bool ForceZero = false; - // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0 // occurs before the instruction. Doing it here prevents any additional // S_WAITCNTs from being emitted if the instruction was marked as // requiring a WAITCNT beforehand. if (MI.getOpcode() == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier()) { - EmitSwaitcnt |= + EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT)); } // TODO: Remove this work-around, enable the assert for Bug 457939 // after fixing the scheduler. Also, the Shader Compiler code is // independent of target. - if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { + if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) { if (ScoreBrackets->getScoreLB(LGKM_CNT) < ScoreBrackets->getScoreUB(LGKM_CNT) && ScoreBrackets->hasPendingSMEM()) { @@ -1052,17 +1139,20 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( // block, so if we only wait on LGKM here, we might end up with // another s_waitcnt inserted right after this if there are non-LGKM // instructions still outstanding. - ForceZero = true; - EmitSwaitcnt = true; + // FIXME: this is too conservative / the comment is wrong. + // We don't wait on everything at the end of the block and we combine + // waitcnts so we should never have back-to-back waitcnts. + ForceEmitZeroWaitcnt = true; + EmitWaitcnt = true; } } // Does this operand processing indicate s_wait counter update? 
- if (EmitSwaitcnt) { + if (EmitWaitcnt || IsForceEmitWaitcnt) { int CntVal[NUM_INST_CNTS]; bool UseDefaultWaitcntStrategy = true; - if (ForceZero) { + if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) { // Force all waitcnts to 0. for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { @@ -1077,7 +1167,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( if (UseDefaultWaitcntStrategy) { for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { - if (EmitSwaitcnt & CNT_MASK(T)) { + if (EmitWaitcnt & CNT_MASK(T)) { int Delta = ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T); int MaxDelta = ScoreBrackets->getWaitCountMax(T); @@ -1087,7 +1177,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( ScoreBrackets->setScoreLB( T, ScoreBrackets->getScoreUB(T) - MaxDelta); } - EmitSwaitcnt &= ~CNT_MASK(T); + EmitWaitcnt &= ~CNT_MASK(T); } CntVal[T] = Delta; } else { @@ -1099,10 +1189,11 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( } // If we are not waiting on any counter we can skip the wait altogether. - if (EmitSwaitcnt != 0) { + if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) { MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt(); int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm(); - if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) != + if (!OldWaitcnt || + (AMDGPU::decodeVmcnt(IV, Imm) != (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) || (AMDGPU::decodeExpcnt(IV, Imm) != (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) || @@ -1114,39 +1205,80 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); if (!ScoreBracket) { - assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end()); + assert(!BlockVisitedSet.count(TBB)); BlockWaitcntBracketsMap[TBB] = - llvm::make_unique<BlockWaitcntBrackets>(); + llvm::make_unique<BlockWaitcntBrackets>(ST); ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); } ScoreBracket->setRevisitLoop(true); - DEBUG(dbgs() << "set-revisit: block" - << ContainingLoop->getHeader()->getNumber() << '\n';); + LLVM_DEBUG(dbgs() + << "set-revisit2: Block" + << ContainingLoop->getHeader()->getNumber() << '\n';); } } // Update an existing waitcount, or make a new one. - MachineFunction &MF = *MI.getParent()->getParent(); - if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) { - SWaitInst = OldWaitcnt; - } else { - SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT), - MI.getDebugLoc()); - CompilerGeneratedWaitcntSet.insert(SWaitInst); - } + unsigned Enc = AMDGPU::encodeWaitcnt(IV, + ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT], + ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT], + ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]); + // We don't remove waitcnts that existed prior to the waitcnt + // pass. Check if the waitcnt to-be-inserted can be avoided + // or if the prev waitcnt can be updated. 
+ bool insertSWaitInst = true; + for (MachineBasicBlock::iterator I = MI.getIterator(), + B = MI.getParent()->begin(); + insertSWaitInst && I != B; --I) { + if (I == MI.getIterator()) + continue; - const MachineOperand &Op = - MachineOperand::CreateImm(AMDGPU::encodeWaitcnt( - IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT])); - SWaitInst->addOperand(MF, Op); + switch (I->getOpcode()) { + case AMDGPU::S_WAITCNT: + if (isWaitcntStronger(I->getOperand(0).getImm(), Enc)) + insertSWaitInst = false; + else if (!OldWaitcnt) { + OldWaitcnt = &*I; + Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc); + } + break; + // TODO: skip over instructions which never require wait. + } + break; + } + if (insertSWaitInst) { + if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) { + if (ForceEmitZeroWaitcnts) + LLVM_DEBUG( + dbgs() + << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n"); + if (IsForceEmitWaitcnt) + LLVM_DEBUG(dbgs() + << "Force emit a s_waitcnt due to debug counter\n"); + + OldWaitcnt->getOperand(0).setImm(Enc); + if (!OldWaitcnt->getParent()) + MI.getParent()->insert(MI, OldWaitcnt); + + LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n" + << "Old Instr: " << MI << '\n' + << "New Instr: " << *OldWaitcnt << '\n'); + } else { + auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(), + MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(Enc); + TrackedWaitcntSet.insert(SWaitInst); + + LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n" + << "Old Instr: " << MI << '\n' + << "New Instr: " << *SWaitInst << '\n'); + } + } if (CntVal[EXP_CNT] == 0) { ScoreBrackets->setMixedExpTypes(false); } } } - - return SWaitInst; } void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB, @@ -1180,7 +1312,7 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { return false; } -void SIInsertWaitcnts::updateEventWaitCntAfter( +void SIInsertWaitcnts::updateEventWaitcntAfter( MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) { // Now look at the instruction opcode. If it is a memory access // instruction, update the upper-bound of the appropriate counter's @@ -1214,7 +1346,7 @@ void SIInsertWaitcnts::updateEventWaitCntAfter( Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC && Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) { ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); - if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() && + if (ST->vmemWriteNeedsExpWaitcnt() && (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) { ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst); } @@ -1247,27 +1379,37 @@ void SIInsertWaitcnts::updateEventWaitCntAfter( } } +// Merge the score brackets of the Block's predecessors; +// this merged score bracket is used when adding waitcnts to the Block void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); int32_t MaxPending[NUM_INST_CNTS] = {0}; int32_t MaxFlat[NUM_INST_CNTS] = {0}; bool MixedExpTypes = false; - // Clear the score bracket state. - ScoreBrackets->clear(); - - // Compute the number of pending elements on block entry. + // For single basic block loops, we need to retain the Block's + // score bracket to have accurate Pred info. So, make a copy of Block's + // score bracket, clear() it (which retains several important bits of info), + // populate, and then replace en masse. 
For non-single basic block loops, + // just clear Block's current score bracket and repopulate in-place. + bool IsSelfPred; + std::unique_ptr<BlockWaitcntBrackets> S; + + IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block)) + != Block.pred_end(); + if (IsSelfPred) { + S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets); + ScoreBrackets = S.get(); + } - // IMPORTANT NOTE: If iterative handling of loops is added, the code will - // need to handle single BBs with backedges to themselves. This means that - // they will need to retain and not clear their initial state. + ScoreBrackets->clear(); // See if there are any uninitialized predecessors. If so, emit an // s_waitcnt 0 at the beginning of the block. - for (MachineBasicBlock *pred : Block.predecessors()) { + for (MachineBasicBlock *Pred : Block.predecessors()) { BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[pred].get(); - bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end(); + BlockWaitcntBracketsMap[Pred].get(); + bool Visited = BlockVisitedSet.count(Pred); if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { continue; } @@ -1306,7 +1448,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { for (MachineBasicBlock *Pred : Block.predecessors()) { BlockWaitcntBrackets *PredScoreBrackets = BlockWaitcntBracketsMap[Pred].get(); - bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end(); + bool Visited = BlockVisitedSet.count(Pred); if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { continue; } @@ -1354,7 +1496,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { // Set the register scoreboard. for (MachineBasicBlock *Pred : Block.predecessors()) { - if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) { + if (!BlockVisitedSet.count(Pred)) { continue; } @@ -1468,7 +1610,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { // sequencing predecessors, because changes to EXEC require waitcnts due to // the delayed nature of these operations. for (MachineBasicBlock *Pred : Block.predecessors()) { - if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) { + if (!BlockVisitedSet.count(Pred)) { continue; } @@ -1496,17 +1638,36 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { } } } + + // if a single block loop, update the score brackets. Not needed for other + // blocks, as we did this in-place + if (IsSelfPred) { + BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets); + } } -/// Return the "bottom" block of a loop. This differs from -/// MachineLoop::getBottomBlock in that it works even if the loop is -/// discontiguous. -MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) { - MachineBasicBlock *Bottom = Loop->getHeader(); - for (MachineBasicBlock *MBB : Loop->blocks()) - if (MBB->getNumber() > Bottom->getNumber()) - Bottom = MBB; - return Bottom; +/// Return true if the given basic block is a "bottom" block of a loop. +/// This works even if the loop is discontiguous. This also handles +/// multiple back-edges for the same "header" block of a loop. +bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop, + const MachineBasicBlock *Block) { + for (MachineBasicBlock *MBB : Loop->blocks()) { + if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) { + return true; + } + } + return false; +} + +/// Count the number of "bottom" basic blocks of a loop. 
+unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) { + unsigned Count = 0; + for (MachineBasicBlock *MBB : Loop->blocks()) { + if (MBB->isSuccessor(Loop->getHeader())) { + Count++; + } + } + return Count; } // Generate s_waitcnt instructions where needed. @@ -1517,8 +1678,8 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); - DEBUG({ - dbgs() << "Block" << Block.getNumber(); + LLVM_DEBUG({ + dbgs() << "*** Block" << Block.getNumber() << " ***"; ScoreBrackets->dump(); }); @@ -1528,16 +1689,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, MachineInstr &Inst = *Iter; // Remove any previously existing waitcnts. if (Inst.getOpcode() == AMDGPU::S_WAITCNT) { - // TODO: Register the old waitcnt and optimize the following waitcnts. - // Leaving the previously existing waitcnts is conservatively correct. - if (CompilerGeneratedWaitcntSet.find(&Inst) == - CompilerGeneratedWaitcntSet.end()) + // Leave pre-existing waitcnts, but note their existence via setWaitcnt. + // Remove the waitcnt-pass-generated waitcnts; the pass will add them back + // as needed. + if (!TrackedWaitcntSet.count(&Inst)) ++Iter; else { - ScoreBrackets->setWaitcnt(&Inst); ++Iter; Inst.removeFromParent(); } + ScoreBrackets->setWaitcnt(&Inst); continue; } @@ -1550,29 +1711,20 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, bool VCCZBugWorkAround = false; if (readsVCCZ(Inst) && - (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) { + (!VCCZBugHandledSet.count(&Inst))) { if (ScoreBrackets->getScoreLB(LGKM_CNT) < ScoreBrackets->getScoreUB(LGKM_CNT) && ScoreBrackets->hasPendingSMEM()) { - if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) + if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) VCCZBugWorkAround = true; } } // Generate an s_waitcnt instruction to be placed before // cur_Inst, if needed. - MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets); - - if (SWaitInst) { - Block.insert(Inst, SWaitInst); - if (ScoreBrackets->getWaitcnt() != SWaitInst) { - DEBUG(dbgs() << "insertWaitcntInBlock\n" - << "Old Instr: " << Inst << '\n' - << "New Instr: " << *SWaitInst << '\n';); - } - } + generateWaitcntInstBefore(Inst, ScoreBrackets); - updateEventWaitCntAfter(Inst, ScoreBrackets); + updateEventWaitcntAfter(Inst, ScoreBrackets); #if 0 // TODO: implement resource type check controlled by options with ub = LB. // If this instruction generates a S_SETVSKIP because it is an @@ -1587,10 +1739,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, ScoreBrackets->clearWaitcnt(); - if (SWaitInst) { - DEBUG({ SWaitInst->print(dbgs() << '\n'); }); - } - DEBUG({ + LLVM_DEBUG({ Inst.print(dbgs()); ScoreBrackets->dump(); }); @@ -1627,21 +1776,22 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // Check if we need to force convergence at loop footer. MachineLoop *ContainingLoop = MLI->getLoopFor(&Block); - if (ContainingLoop && loopBottom(ContainingLoop) == &Block) { + if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) { LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); WaitcntData->print(); - DEBUG(dbgs() << '\n';); + LLVM_DEBUG(dbgs() << '\n';); // The iterative waitcnt insertion algorithm aims for optimal waitcnt - // placement and doesn't always guarantee convergence for a loop. Each - // loop should take at most 2 iterations for it to converge naturally. 
- // When this max is reached and result doesn't converge, we force - // convergence by inserting a s_waitcnt at the end of loop footer. - if (WaitcntData->getIterCnt() > 2) { + // placement, but doesn't guarantee convergence for a loop. Each + // loop should take at most (n+1) iterations for it to converge naturally, + // where n is the number of bottom blocks. If this threshold is reached and + // the result hasn't converged, then we force convergence by inserting + // a s_waitcnt at the end of loop footer. + if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) { // To ensure convergence, need to make wait events at loop footer be no // more than those from the previous iteration. - // As a simplification, Instead of tracking individual scores and - // generate the precise wait count, just wait on 0. + // As a simplification, instead of tracking individual scores and + // generating the precise wait count, just wait on 0. bool HasPending = false; MachineInstr *SWaitInst = WaitcntData->getWaitcnt(); for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; @@ -1649,16 +1799,16 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); HasPending = true; + break; } } if (HasPending) { if (!SWaitInst) { - SWaitInst = Block.getParent()->CreateMachineInstr( - TII->get(AMDGPU::S_WAITCNT), DebugLoc()); - CompilerGeneratedWaitcntSet.insert(SWaitInst); - const MachineOperand &Op = MachineOperand::CreateImm(0); - SWaitInst->addOperand(MF, Op); + SWaitInst = BuildMI(Block, Block.getFirstNonPHI(), + DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + TrackedWaitcntSet.insert(SWaitInst); #if 0 // TODO: Format the debug output OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context); OutputTransformAdd(SWaitInst, context); @@ -1670,7 +1820,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } if (SWaitInst) { - DEBUG({ + LLVM_DEBUG({ SWaitInst->print(dbgs()); dbgs() << "\nAdjusted score board:"; ScoreBrackets->dump(); @@ -1678,7 +1828,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // Add this waitcnt to the block. It is either newly created or // created in previous iterations and added back since block traversal - // always remove waitcnt. + // always removes waitcnts. 
insertWaitcntBeforeCF(Block, SWaitInst); WaitcntData->setWaitcnt(SWaitInst); } @@ -1687,7 +1837,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { - ST = &MF.getSubtarget<SISubtarget>(); + ST = &MF.getSubtarget<GCNSubtarget>(); TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); @@ -1696,6 +1846,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); AMDGPUASI = ST->getAMDGPUAS(); + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) + ForceEmitWaitcnt[T] = false; + HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); @@ -1712,6 +1867,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1; + TrackedWaitcntSet.clear(); + BlockVisitedSet.clear(); + VCCZBugHandledSet.clear(); + LoopWaitcntDataMap.clear(); + BlockWaitcntProcessedSet.clear(); + // Walk over the blocks in reverse post-dominator order, inserting // s_waitcnt where needed. ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); @@ -1726,7 +1887,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); if (!ScoreBrackets) { - BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(); + BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST); ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); } ScoreBrackets->setPostOrder(MBB.getNumber()); @@ -1737,22 +1898,30 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { // If we are walking into the block from before the loop, then guarantee // at least 1 re-walk over the loop to propagate the information, even if // no S_WAITCNT instructions were generated. - if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I && - (BlockWaitcntProcessedSet.find(&MBB) == - BlockWaitcntProcessedSet.end())) { - BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true); - DEBUG(dbgs() << "set-revisit: block" - << ContainingLoop->getHeader()->getNumber() << '\n';); + if (ContainingLoop && ContainingLoop->getHeader() == &MBB) { + unsigned Count = countNumBottomBlocks(ContainingLoop); + + // If the loop has multiple back-edges, and so more than one "bottom" + // basic block, we have to guarantee a re-walk over every blocks. + if ((std::count(BlockWaitcntProcessedSet.begin(), + BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) { + BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true); + LLVM_DEBUG(dbgs() << "set-revisit1: Block" + << ContainingLoop->getHeader()->getNumber() << '\n';); + } } // Walk over the instructions. insertWaitcntInBlock(MF, MBB); - // Flag that waitcnts have been processed at least once. - BlockWaitcntProcessedSet.insert(&MBB); + // Record that waitcnts have been processed at least once for this block. + BlockWaitcntProcessedSet.push_back(&MBB); - // See if we want to revisit the loop. - if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) { + // See if we want to revisit the loop. If a loop has multiple back-edges, + // we shouldn't revisit the same "bottom" basic block. 
+ if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) && + std::count(BlockWaitcntProcessedSet.begin(), + BlockWaitcntProcessedSet.end(), &MBB) == 1) { MachineBasicBlock *EntryBB = ContainingLoop->getHeader(); BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get(); if (EntrySB && EntrySB->getRevisitLoop()) { @@ -1772,7 +1941,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); WaitcntData->incIterCnt(); - DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';); + LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';); continue; } else { LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); @@ -1837,7 +2006,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may - // depend on. We can't track them and it's better to to the wait after the + // depend on. We can't track them and it's better to the wait after the // costly call sequence. // TODO: Could insert earlier and schedule more liberally with operations diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp deleted file mode 100644 index b074b95c2d3c..000000000000 --- a/lib/Target/AMDGPU/SIInsertWaits.cpp +++ /dev/null @@ -1,703 +0,0 @@ -//===- SILowerControlFlow.cpp - Use predicates for control flow -----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Insert wait instructions for memory reads and writes. -/// -/// Memory reads and writes are issued asynchronously, so we need to insert -/// S_WAITCNT instructions when we want to access any of their results or -/// overwrite any register that's used asynchronously. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "SIRegisterInfo.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/DebugLoc.h" -#include "llvm/MC/MCInstrDesc.h" -#include "llvm/Pass.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include <algorithm> -#include <cassert> -#include <cstdint> -#include <cstring> -#include <utility> - -#define DEBUG_TYPE "si-insert-waits" - -using namespace llvm; - -namespace { - -/// \brief One variable for each of the hardware counters -using Counters = union { - struct { - unsigned VM; - unsigned EXP; - unsigned LGKM; - } Named; - unsigned Array[3]; -}; - -using InstType = enum { - OTHER, - SMEM, - VMEM -}; - -using RegCounters = Counters[512]; -using RegInterval = std::pair<unsigned, unsigned>; - -class SIInsertWaits : public MachineFunctionPass { -private: - const SISubtarget *ST = nullptr; - const SIInstrInfo *TII = nullptr; - const SIRegisterInfo *TRI = nullptr; - const MachineRegisterInfo *MRI; - AMDGPU::IsaInfo::IsaVersion ISA; - - /// \brief Constant zero value - static const Counters ZeroCounts; - - /// \brief Hardware limits - Counters HardwareLimits; - - /// \brief Counter values we have already waited on. - Counters WaitedOn; - - /// \brief Counter values that we must wait on before the next counter - /// increase. - Counters DelayedWaitOn; - - /// \brief Counter values for last instruction issued. - Counters LastIssued; - - /// \brief Registers used by async instructions. - RegCounters UsedRegs; - - /// \brief Registers defined by async instructions. - RegCounters DefinedRegs; - - /// \brief Different export instruction types seen since last wait. - unsigned ExpInstrTypesSeen = 0; - - /// \brief Type of the last opcode. - InstType LastOpcodeType; - - bool LastInstWritesM0; - - /// Whether or not we have flat operations outstanding. - bool IsFlatOutstanding; - - /// \brief Whether the machine function returns void - bool ReturnsVoid; - - /// Whether the VCCZ bit is possibly corrupt - bool VCCZCorrupt = false; - - /// \brief Get increment/decrement amount for this instruction. - Counters getHwCounts(MachineInstr &MI); - - /// \brief Is operand relevant for async execution? - bool isOpRelevant(MachineOperand &Op); - - /// \brief Get register interval an operand affects. - RegInterval getRegInterval(const TargetRegisterClass *RC, - const MachineOperand &Reg) const; - - /// \brief Handle instructions async components - void pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const Counters& Increment); - - /// \brief Insert the actual wait instruction - bool insertWait(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const Counters &Counts); - - /// \brief Handle existing wait instructions (from intrinsics) - void handleExistingWait(MachineBasicBlock::iterator I); - - /// \brief Do we need def2def checks? 
- bool unorderedDefines(MachineInstr &MI); - - /// \brief Resolve all operand dependencies to counter requirements - Counters handleOperands(MachineInstr &MI); - - /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG. - void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); - - /// Return true if there are LGKM instrucitons that haven't been waited on - /// yet. - bool hasOutstandingLGKM() const; - -public: - static char ID; - - SIInsertWaits() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { - return "SI insert wait instructions"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // end anonymous namespace - -INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE, - "SI Insert Waits", false, false) -INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE, - "SI Insert Waits", false, false) - -char SIInsertWaits::ID = 0; - -char &llvm::SIInsertWaitsID = SIInsertWaits::ID; - -FunctionPass *llvm::createSIInsertWaitsPass() { - return new SIInsertWaits(); -} - -const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; - -static bool readsVCCZ(const MachineInstr &MI) { - unsigned Opc = MI.getOpcode(); - return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && - !MI.getOperand(1).isUndef(); -} - -bool SIInsertWaits::hasOutstandingLGKM() const { - return WaitedOn.Named.LGKM != LastIssued.Named.LGKM; -} - -Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { - uint64_t TSFlags = MI.getDesc().TSFlags; - Counters Result = { { 0, 0, 0 } }; - - Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); - - // Only consider stores or EXP for EXP_CNT - Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore(); - - // LGKM may uses larger values - if (TSFlags & SIInstrFlags::LGKM_CNT) { - - if (TII->isSMRD(MI)) { - - if (MI.getNumOperands() != 0) { - assert(MI.getOperand(0).isReg() && - "First LGKM operand must be a register!"); - - // XXX - What if this is a write into a super register? - const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0); - unsigned Size = TRI->getRegSizeInBits(*RC); - Result.Named.LGKM = Size > 32 ? 2 : 1; - } else { - // s_dcache_inv etc. do not have a a destination register. Assume we - // want a wait on these. - // XXX - What is the right value? - Result.Named.LGKM = 1; - } - } else { - // DS - Result.Named.LGKM = 1; - } - - } else { - Result.Named.LGKM = 0; - } - - return Result; -} - -bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { - // Constants are always irrelevant - if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) - return false; - - // Defines are always relevant - if (Op.isDef()) - return true; - - // For exports all registers are relevant. - // TODO: Skip undef/disabled registers. - MachineInstr &MI = *Op.getParent(); - if (TII->isEXP(MI)) - return true; - - // For stores the stored value is also relevant - if (!MI.getDesc().mayStore()) - return false; - - // Check if this operand is the value being stored. - // Special case for DS/FLAT instructions, since the address - // operand comes before the value operand and it may have - // multiple data operands. 
- - if (TII->isDS(MI)) { - MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); - if (Data0 && Op.isIdenticalTo(*Data0)) - return true; - - MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1); - return Data1 && Op.isIdenticalTo(*Data1); - } - - if (TII->isFLAT(MI)) { - MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata); - if (Data && Op.isIdenticalTo(*Data)) - return true; - } - - // NOTE: This assumes that the value operand is before the - // address operand, and that there is only one value operand. - for (MachineInstr::mop_iterator I = MI.operands_begin(), - E = MI.operands_end(); I != E; ++I) { - - if (I->isReg() && I->isUse()) - return Op.isIdenticalTo(*I); - } - - return false; -} - -RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, - const MachineOperand &Reg) const { - unsigned Size = TRI->getRegSizeInBits(*RC); - assert(Size >= 32); - - RegInterval Result; - Result.first = TRI->getEncodingValue(Reg.getReg()); - Result.second = Result.first + Size / 32; - - return Result; -} - -void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const Counters &Increment) { - // Get the hardware counter increments and sum them up - Counters Limit = ZeroCounts; - unsigned Sum = 0; - - if (TII->mayAccessFlatAddressSpace(*I)) - IsFlatOutstanding = true; - - for (unsigned i = 0; i < 3; ++i) { - LastIssued.Array[i] += Increment.Array[i]; - if (Increment.Array[i]) - Limit.Array[i] = LastIssued.Array[i]; - Sum += Increment.Array[i]; - } - - // If we don't increase anything then that's it - if (Sum == 0) { - LastOpcodeType = OTHER; - return; - } - - if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { - // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM - // or SMEM clause, respectively. - // - // The temporary workaround is to break the clauses with S_NOP. - // - // The proper solution would be to allocate registers such that all source - // and destination registers don't overlap, e.g. this is illegal: - // r0 = load r2 - // r2 = load r0 - if (LastOpcodeType == VMEM && Increment.Named.VM) { - // Insert a NOP to break the clause. - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) - .addImm(0); - LastInstWritesM0 = false; - } - - if (TII->isSMRD(*I)) - LastOpcodeType = SMEM; - else if (Increment.Named.VM) - LastOpcodeType = VMEM; - } - - // Remember which export instructions we have seen - if (Increment.Named.EXP) { - ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2; - } - - for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &Op = I->getOperand(i); - if (!isOpRelevant(Op)) - continue; - - const TargetRegisterClass *RC = TII->getOpRegClass(*I, i); - RegInterval Interval = getRegInterval(RC, Op); - for (unsigned j = Interval.first; j < Interval.second; ++j) { - - // Remember which registers we define - if (Op.isDef()) - DefinedRegs[j] = Limit; - - // and which one we are using - if (Op.isUse()) - UsedRegs[j] = Limit; - } - } -} - -bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - const Counters &Required) { - // End of program? No need to wait on anything - // A function not returning void needs to wait, because other bytecode will - // be appended after it and we don't know what it will be. 
- if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid) - return false; - - // Figure out if the async instructions execute in order - bool Ordered[3]; - - // VM_CNT is always ordered except when there are flat instructions, which - // can return out of order. - Ordered[0] = !IsFlatOutstanding; - - // EXP_CNT is unordered if we have both EXP & VM-writes - Ordered[1] = ExpInstrTypesSeen == 3; - - // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS - Ordered[2] = false; - - // The values we are going to put into the S_WAITCNT instruction - Counters Counts = HardwareLimits; - - // Do we really need to wait? - bool NeedWait = false; - - for (unsigned i = 0; i < 3; ++i) { - if (Required.Array[i] <= WaitedOn.Array[i]) - continue; - - NeedWait = true; - - if (Ordered[i]) { - unsigned Value = LastIssued.Array[i] - Required.Array[i]; - - // Adjust the value to the real hardware possibilities. - Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]); - } else - Counts.Array[i] = 0; - - // Remember on what we have waited on. - WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; - } - - if (!NeedWait) - return false; - - // Reset EXP_CNT instruction types - if (Counts.Named.EXP == 0) - ExpInstrTypesSeen = 0; - - // Build the wait instruction - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(AMDGPU::encodeWaitcnt(ISA, - Counts.Named.VM, - Counts.Named.EXP, - Counts.Named.LGKM)); - - LastOpcodeType = OTHER; - LastInstWritesM0 = false; - IsFlatOutstanding = false; - return true; -} - -/// \brief helper function for handleOperands -static void increaseCounters(Counters &Dst, const Counters &Src) { - for (unsigned i = 0; i < 3; ++i) - Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); -} - -/// \brief check whether any of the counters is non-zero -static bool countersNonZero(const Counters &Counter) { - for (unsigned i = 0; i < 3; ++i) - if (Counter.Array[i]) - return true; - return false; -} - -void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) { - assert(I->getOpcode() == AMDGPU::S_WAITCNT); - - unsigned Imm = I->getOperand(0).getImm(); - Counters Counts, WaitOn; - - Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm); - Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm); - Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm); - - for (unsigned i = 0; i < 3; ++i) { - if (Counts.Array[i] <= LastIssued.Array[i]) - WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; - else - WaitOn.Array[i] = 0; - } - - increaseCounters(DelayedWaitOn, WaitOn); -} - -Counters SIInsertWaits::handleOperands(MachineInstr &MI) { - Counters Result = ZeroCounts; - - // For each register affected by this instruction increase the result - // sequence. - // - // TODO: We could probably just look at explicit operands if we removed VCC / - // EXEC from SMRD dest reg classes. 
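The arithmetic performed by insertWait above can be modeled in isolation: for an ordered counter we may leave (LastIssued - Required) events still outstanding, clamped to the hardware field width, while an unordered counter has to drain to zero. A minimal sketch, with a simplified per-counter signature:

// One counter's field value for the S_WAITCNT built in insertWait above.
#include <algorithm>

unsigned waitValue(unsigned Required, unsigned WaitedOn, unsigned LastIssued,
                   unsigned HwLimit, bool Ordered) {
  if (Required <= WaitedOn)
    return HwLimit;                           // already satisfied: encode "no wait"
  if (!Ordered)
    return 0;                                 // unordered: must drain completely
  return std::min(LastIssued - Required, HwLimit); // allow the rest to stay in flight
}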
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &Op = MI.getOperand(i); - if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) - continue; - - const TargetRegisterClass *RC = TII->getOpRegClass(MI, i); - RegInterval Interval = getRegInterval(RC, Op); - for (unsigned j = Interval.first; j < Interval.second; ++j) { - if (Op.isDef()) { - increaseCounters(Result, UsedRegs[j]); - increaseCounters(Result, DefinedRegs[j]); - } - - if (Op.isUse()) - increaseCounters(Result, DefinedRegs[j]); - } - } - - return Result; -} - -void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { - if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) - return; - - // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. - if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) { - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); - LastInstWritesM0 = false; - return; - } - - // Set whether this instruction sets M0 - LastInstWritesM0 = false; - - unsigned NumOperands = I->getNumOperands(); - for (unsigned i = 0; i < NumOperands; i++) { - const MachineOperand &Op = I->getOperand(i); - - if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0) - LastInstWritesM0 = true; - } -} - -/// Return true if \p MBB has one successor immediately following, and is its -/// only predecessor -static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) { - if (MBB.succ_size() != 1) - return false; - - const MachineBasicBlock *Succ = *MBB.succ_begin(); - return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ); -} - -// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" -// around other non-memory instructions. -bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { - bool Changes = false; - - ST = &MF.getSubtarget<SISubtarget>(); - TII = ST->getInstrInfo(); - TRI = &TII->getRegisterInfo(); - MRI = &MF.getRegInfo(); - ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits()); - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - - HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA); - HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA); - HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA); - - WaitedOn = ZeroCounts; - DelayedWaitOn = ZeroCounts; - LastIssued = ZeroCounts; - LastOpcodeType = OTHER; - LastInstWritesM0 = false; - IsFlatOutstanding = false; - ReturnsVoid = MFI->returnsVoid(); - - memset(&UsedRegs, 0, sizeof(UsedRegs)); - memset(&DefinedRegs, 0, sizeof(DefinedRegs)); - - SmallVector<MachineInstr *, 4> RemoveMI; - SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; - - bool HaveScalarStores = false; - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - if (!HaveScalarStores && TII->isScalarStore(*I)) - HaveScalarStores = true; - - if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { - // There is a hardware bug on CI/SI where SMRD instruction may corrupt - // vccz bit, so when we detect that an instruction may read from a - // corrupt vccz bit, we need to: - // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to - // complete. - // 2. Restore the correct value of vccz by writing the current value - // of vcc back to vcc. 
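A standalone model of the corruption tracking described above (illustrative names, not the in-tree pass state): an SMRD may leave vccz stale until its lgkmcnt drains, a later write to vcc only restores vccz once nothing is in flight, and a vccz-reading branch while stale needs the wait plus vcc rewrite. The code that follows applies the same rule to MachineInstrs.

#include <cstdio>

struct VcczTracker {
  bool Corrupt = false;          // vccz may disagree with vcc
  bool OutstandingLGKM = false;  // an SMRD result is still pending

  void onSMRD()      { Corrupt = true; OutstandingLGKM = true; }
  void onWaitLgkm0() { OutstandingLGKM = false; }
  void onVCCWrite()  { if (!OutstandingLGKM) Corrupt = false; }
  bool branchNeedsFix() const { return Corrupt; } // emit s_waitcnt + vcc rewrite
};

int main() {
  VcczTracker T;
  T.onSMRD();
  T.onVCCWrite();                                      // too early: SMRD pending
  std::printf("fix needed: %d\n", T.branchNeedsFix()); // 1
  T.onWaitLgkm0();
  T.onVCCWrite();                                      // vccz restored
  std::printf("fix needed: %d\n", T.branchNeedsFix()); // 0
}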
- - if (TII->isSMRD(I->getOpcode())) { - VCCZCorrupt = true; - } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) { - // FIXME: We only care about SMRD instructions here, not LDS or GDS. - // Whenever we store a value in vcc, the correct value of vccz is - // restored. - VCCZCorrupt = false; - } - - // Check if we need to apply the bug work-around - if (VCCZCorrupt && readsVCCZ(*I)) { - DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n'); - - // Wait on everything, not just LGKM. vccz reads usually come from - // terminators, and we always wait on everything at the end of the - // block, so if we only wait on LGKM here, we might end up with - // another s_waitcnt inserted right after this if there are non-LGKM - // instructions still outstanding. - insertWait(MBB, I, LastIssued); - - // Restore the vccz bit. Any time a value is written to vcc, the vcc - // bit is updated, so we can restore the bit by reading the value of - // vcc and then writing it back to the register. - BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), - AMDGPU::VCC) - .addReg(AMDGPU::VCC); - } - } - - // Record pre-existing, explicitly requested waits - if (I->getOpcode() == AMDGPU::S_WAITCNT) { - handleExistingWait(*I); - RemoveMI.push_back(&*I); - continue; - } - - Counters Required; - - // Wait for everything before a barrier. - // - // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, - // but we also want to wait for any other outstanding transfers before - // signalling other hardware blocks - if ((I->getOpcode() == AMDGPU::S_BARRIER && - !ST->hasAutoWaitcntBeforeBarrier()) || - I->getOpcode() == AMDGPU::S_SENDMSG || - I->getOpcode() == AMDGPU::S_SENDMSGHALT) - Required = LastIssued; - else - Required = handleOperands(*I); - - Counters Increment = getHwCounts(*I); - - if (countersNonZero(Required) || countersNonZero(Increment)) - increaseCounters(Required, DelayedWaitOn); - - Changes |= insertWait(MBB, I, Required); - - pushInstruction(MBB, I, Increment); - handleSendMsg(MBB, I); - - if (I->getOpcode() == AMDGPU::S_ENDPGM || - I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) - EndPgmBlocks.push_back(&MBB); - } - - // Wait for everything at the end of the MBB. If there is only one - // successor, we can defer this until the uses there. - if (!hasTrivialSuccessor(MBB)) - Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); - } - - if (HaveScalarStores) { - // If scalar writes are used, the cache must be flushed or else the next - // wave to reuse the same scratch memory can be clobbered. - // - // Insert s_dcache_wb at wave termination points if there were any scalar - // stores, and only if the cache hasn't already been flushed. This could be - // improved by looking across blocks for flushes in postdominating blocks - // from the stores but an explicitly requested flush is probably very rare. - for (MachineBasicBlock *MBB : EndPgmBlocks) { - bool SeenDCacheWB = false; - - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); - I != E; ++I) { - if (I->getOpcode() == AMDGPU::S_DCACHE_WB) - SeenDCacheWB = true; - else if (TII->isScalarStore(*I)) - SeenDCacheWB = false; - - // FIXME: It would be better to insert this before a waitcnt if any. 
- if ((I->getOpcode() == AMDGPU::S_ENDPGM || - I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) { - Changes = true; - BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); - } - } - } - } - - for (MachineInstr *I : RemoveMI) - I->eraseFromParent(); - - if (!MFI->isEntryFunction()) { - // Wait for any outstanding memory operations that the input registers may - // depend on. We can't track them and it's better to to the wait after the - // costly call sequence. - - // TODO: Could insert earlier and schedule more liberally with operations - // that only use caller preserved registers. - MachineBasicBlock &EntryBB = MF.front(); - BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); - - Changes = true; - } - - return Changes; -} diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td index 25917cc06e6a..b73d30940fc3 100644 --- a/lib/Target/AMDGPU/SIInstrFormats.td +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -12,16 +12,16 @@ //===----------------------------------------------------------------------===// def isGCN : Predicate<"Subtarget->getGeneration() " - ">= SISubtarget::SOUTHERN_ISLANDS">, + ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureGCN">; def isSI : Predicate<"Subtarget->getGeneration() " - "== SISubtarget::SOUTHERN_ISLANDS">, + "== AMDGPUSubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureSouthernIslands">; class InstSI <dag outs, dag ins, string asm = "", list<dag> pattern = []> : - AMDGPUInst<outs, ins, asm, pattern>, PredicateControl { + AMDGPUInst<outs, ins, asm, pattern>, GCNPredicateControl { let SubtargetPredicate = isGCN; // Low bits - basic encoding information. @@ -118,6 +118,9 @@ class InstSI <dag outs, dag ins, string asm = "", // This bit indicates that this is a packed VOP3P instruction field bit IsPacked = 0; + // This bit indicates that this is a D16 buffer instruction. + field bit D16Buf = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -173,6 +176,8 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{49} = IsPacked; + let TSFlags{50} = D16Buf; + let SchedRW = [Write32Bit]; field bits<1> DisableSIDecoder = 0; @@ -181,6 +186,9 @@ class InstSI <dag outs, dag ins, string asm = "", let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1); let AsmVariantName = AMDGPUAsmVariants.Default; + + // Avoid changing source registers in a way that violates constant bus read limitations. 
+ let hasExtraSrcRegAllocReq = !if(VOP1,1,!if(VOP2,1,!if(VOP3,1,!if(VOPC,1,!if(SDWA,1, !if(VALU,1,0)))))); } class PseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = ""> @@ -247,6 +255,7 @@ class MIMGe <bits<7> op> : Enc64 { bits<1> tfe; bits<1> lwe; bits<1> slc; + bit d16; bits<8> vaddr; bits<7> srsrc; bits<7> ssamp; @@ -265,6 +274,7 @@ class MIMGe <bits<7> op> : Enc64 { let Inst{47-40} = vdata; let Inst{52-48} = srsrc{6-2}; let Inst{57-53} = ssamp{6-2}; + let Inst{63} = d16; } class EXPe : Enc64 { @@ -309,6 +319,7 @@ class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> : let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; + let VALU = 1; } class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> : @@ -323,15 +334,3 @@ class EXPCommon<dag outs, dag ins, string asm, list<dag> pattern> : } } // End Uses = [EXEC] - -class MIMG <dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern> { - - let VM_CNT = 1; - let EXP_CNT = 1; - let MIMG = 1; - let Uses = [EXEC]; - - let UseNamedOperandTable = 1; - let hasSideEffects = 0; // XXX ???? -} diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 61967605432e..6c85c92454c3 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -8,17 +8,19 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief SI Implementation of TargetInstrInfo. +/// SI Implementation of TargetInstrInfo. // //===----------------------------------------------------------------------===// #include "SIInstrInfo.h" #include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "GCNHazardRecognizer.h" #include "SIDefines.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" @@ -37,7 +39,6 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -53,6 +54,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include <cassert> @@ -62,6 +64,19 @@ using namespace llvm; +#define GET_INSTRINFO_CTOR_DTOR +#include "AMDGPUGenInstrInfo.inc" + +namespace llvm { +namespace AMDGPU { +#define GET_D16ImageDimIntrinsics_IMPL +#define GET_ImageDimIntrinsicTable_IMPL +#define GET_RsrcIntrinsics_IMPL +#include "AMDGPUGenSearchableTables.inc" +} +} + + // Must be at least 4 to be able to branch over minimum unconditional branch // code. This is only for making it possible to write reasonably small tests for // long branches. 
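The MIMGe change above (in SIInstrFormats.td) adds the d16 flag at Inst{63}. Packing just the fields visible in that hunk by hand would look roughly like the sketch below; the low dword and the remaining high-dword fields are omitted since they are outside the shown context.

#include <cstdint>

uint64_t packMIMGHigh(uint8_t vdata, uint8_t srsrc, uint8_t ssamp, bool d16) {
  uint64_t Inst = 0;
  Inst |= uint64_t(vdata) << 40;        // Inst{47-40} = vdata
  Inst |= uint64_t(srsrc >> 2) << 48;   // Inst{52-48} = srsrc{6-2}
  Inst |= uint64_t(ssamp >> 2) << 53;   // Inst{57-53} = ssamp{6-2}
  Inst |= uint64_t(d16) << 63;          // Inst{63}    = d16 (new bit)
  return Inst;
}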
@@ -69,8 +84,9 @@ static cl::opt<unsigned> BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)")); -SIInstrInfo::SIInstrInfo(const SISubtarget &ST) - : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {} +SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) + : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), + RI(ST), ST(ST) {} //===----------------------------------------------------------------------===// // TargetInstrInfo callbacks @@ -89,7 +105,7 @@ static SDValue findChainOperand(SDNode *Load) { return LastOp; } -/// \brief Returns true if both nodes have the same value for the given +/// Returns true if both nodes have the same value for the given /// operand \p Op, or if both nodes do not have this operand. static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { unsigned Opc0 = N0->getMachineOpcode(); @@ -437,6 +453,28 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; } +// FIXME: This behaves strangely. If, for example, you have 32 load + stores, +// the first 16 loads will be interleaved with the stores, and the next 16 will +// be clustered as expected. It should really split into 2 16 store batches. +// +// Loads are clustered until this returns false, rather than trying to schedule +// groups of stores. This also means we have to deal with saying different +// address space loads should be clustered, and ones which might cause bank +// conflicts. +// +// This might be deprecated so it might not be worth that much effort to fix. +bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, + int64_t Offset0, int64_t Offset1, + unsigned NumLoads) const { + assert(Offset1 > Offset0 && + "Second offset should be larger than first offset!"); + // If we have less than 16 loads in a row, and the offsets are within 64 + // bytes, then schedule together. + + // A cacheline is 64 bytes (for global memory). + return (NumLoads <= 16 && (Offset1 - Offset0) < 64); +} + static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned DestReg, @@ -827,10 +865,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineFrameInfo &FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - assert(SrcReg != MFI->getStackPtrOffsetReg() && - SrcReg != MFI->getFrameOffsetReg() && - SrcReg != MFI->getScratchWaveOffsetReg()); - unsigned Size = FrameInfo.getObjectSize(FrameIndex); unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); MachinePointerInfo PtrInfo @@ -864,7 +898,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, // needing them, and need to ensure that the reserved registers are // correctly handled. - FrameInfo.setStackID(FrameIndex, 1); + FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL); if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. 
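The shouldScheduleLoadsNear hook added above in this file encodes a simple heuristic: keep clustering while fewer than 16 loads have been grouped and the offsets stay within one 64-byte cacheline. Reduced to a self-contained predicate (illustrative name, offsets assumed byte-granular):

bool clusterLoads(long offset0, long offset1, unsigned numLoads) {
  // offset1 > offset0 is asserted by the caller in the real hook.
  return numLoads <= 16 && (offset1 - offset0) < 64; // one 64-byte cacheline
}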
Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); @@ -960,7 +994,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } - FrameInfo.setStackID(FrameIndex, 1); + FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL); MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) @@ -1001,7 +1035,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( unsigned FrameOffset, unsigned Size) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); unsigned WavefrontSize = ST.getWavefrontSize(); @@ -1137,7 +1171,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MBB.findDebugLoc(MI); switch (MI.getOpcode()) { - default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + default: return TargetInstrInfo::expandPostRAPseudo(MI); case AMDGPU::S_MOV_B64_term: // This is only a terminator to get the correct spill code placement during // register allocation. @@ -1269,6 +1303,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(AMDGPU::S_MOV_B64)); break; } + case TargetOpcode::BUNDLE: { + if (!MI.mayLoad()) + return false; + + // If it is a load it must be a memory clause + for (MachineBasicBlock::instr_iterator I = MI.getIterator(); + I->isBundledWithSucc(); ++I) { + I->unbundleFromSucc(); + for (MachineOperand &MO : I->operands()) + if (MO.isReg()) + MO.setIsInternalRead(false); + } + + MI.eraseFromParent(); + break; + } } return true; } @@ -1887,16 +1937,16 @@ unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( switch(Kind) { case PseudoSourceValue::Stack: case PseudoSourceValue::FixedStack: - return AMDGPUASI.PRIVATE_ADDRESS; + return ST.getAMDGPUAS().PRIVATE_ADDRESS; case PseudoSourceValue::ConstantPool: case PseudoSourceValue::GOT: case PseudoSourceValue::JumpTable: case PseudoSourceValue::GlobalValueCallEntry: case PseudoSourceValue::ExternalSymbolCallEntry: case PseudoSourceValue::TargetCustom: - return AMDGPUASI.CONSTANT_ADDRESS; + return ST.getAMDGPUAS().CONSTANT_ADDRESS; } - return AMDGPUASI.FLAT_ADDRESS; + return ST.getAMDGPUAS().FLAT_ADDRESS; } static void removeModOperands(MachineInstr &MI) { @@ -2165,20 +2215,24 @@ static int64_t getFoldableImm(const MachineOperand* MO) { MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, MachineInstr &MI, LiveVariables *LV) const { + unsigned Opc = MI.getOpcode(); bool IsF16 = false; + bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64; - switch (MI.getOpcode()) { + switch (Opc) { default: return nullptr; case AMDGPU::V_MAC_F16_e64: IsF16 = true; LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_FMAC_F32_e64: break; case AMDGPU::V_MAC_F16_e32: IsF16 = true; LLVM_FALLTHROUGH; - case AMDGPU::V_MAC_F32_e32: { + case AMDGPU::V_MAC_F32_e32: + case AMDGPU::V_FMAC_F32_e32: { int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); const MachineOperand *Src0 = &MI.getOperand(Src0Idx); @@ -2203,7 +2257,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, const MachineOperand *Clamp = getNamedOperand(MI, 
AMDGPU::OpName::clamp); const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); - if (!Src0Mods && !Src1Mods && !Clamp && !Omod && + if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod && // If we have an SGPR input, we will violate the constant bus restriction. (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { if (auto Imm = getFoldableImm(Src2)) { @@ -2234,8 +2288,10 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, } } - return BuildMI(*MBB, MI, MI.getDebugLoc(), - get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32)) + assert((!IsFMA || !IsF16) && "fmac only expected with f32"); + unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 : + (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32); + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) .addImm(Src0Mods ? Src0Mods->getImm() : 0) .add(*Src0) @@ -2339,6 +2395,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, } case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { + if (isUInt<16>(Imm)) { + int16_t Trunc = static_cast<int16_t>(Imm); + return ST.has16BitInsts() && + AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); + } + if (!(Imm & 0xffff)) { + return ST.has16BitInsts() && + AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm()); + } uint32_t Trunc = static_cast<uint32_t>(Imm); return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); } @@ -2711,14 +2776,16 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } - // Verify VOP* - if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) { + // Verify VOP*. Ignore multiple sgpr operands on writelane. + if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 + && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; unsigned ConstantBusCount = 0; + unsigned LiteralCount = 0; if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) ++ConstantBusCount; @@ -2738,6 +2805,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, SGPRUsed = MO.getReg(); } else { ++ConstantBusCount; + ++LiteralCount; } } } @@ -2745,6 +2813,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, ErrInfo = "VOP* instruction uses the constant bus more than once"; return false; } + + if (isVOP3(MI) && LiteralCount) { + ErrInfo = "VOP3 instruction uses literal"; + return false; + } } // Verify misc. restrictions on specific instructions. 
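The constant-bus verification above enforces two rules: a VOP* instruction may make at most one constant-bus read (one distinct SGPR, or one literal), and a VOP3 encoding may not use a literal at all. A minimal standalone model with simplified operand types:

#include <optional>
#include <vector>

struct SrcOp { bool isSGPR; bool isLiteral; unsigned reg; };

bool violatesConstantBus(const std::vector<SrcOp> &Srcs, bool IsVOP3) {
  unsigned busReads = 0, literals = 0;
  std::optional<unsigned> sgprUsed;             // repeated SGPR counts once
  for (const SrcOp &S : Srcs) {
    if (S.isSGPR) {
      if (!sgprUsed || *sgprUsed != S.reg) { ++busReads; sgprUsed = S.reg; }
    } else if (S.isLiteral) {
      ++busReads;
      ++literals;
    }
  }
  return busReads > 1 || (IsVOP3 && literals != 0);
}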
@@ -2842,7 +2915,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } - if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) { + if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) { const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); if (Offset->getImm() != 0) { ErrInfo = "subtarget does not support offsets in flat instructions"; @@ -2850,6 +2923,22 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl); + if (DppCt) { + using namespace AMDGPU::DPP; + + unsigned DC = DppCt->getImm(); + if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 || + DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST || + (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) || + (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) || + (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) || + (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) { + ErrInfo = "Invalid dpp_ctrl value"; + return false; + } + } + return true; } @@ -3147,6 +3236,29 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, legalizeOpWithMove(MI, Src0Idx); } + // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for + // both the value to write (src0) and lane select (src1). Fix up non-SGPR + // src0/src1 with V_READFIRSTLANE. + if (Opc == AMDGPU::V_WRITELANE_B32) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI.getOperand(Src0Idx); + const DebugLoc &DL = MI.getDebugLoc(); + if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) { + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src0); + Src0.ChangeToRegister(Reg, false); + } + if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) { + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + const DebugLoc &DL = MI.getDebugLoc(); + BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg) + .add(Src1); + Src1.ChangeToRegister(Reg, false); + } + return; + } + // VOP2 src0 instructions support all operand types, so we don't need to check // their legality. If src1 is already legal, we don't need to do anything. if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) @@ -3261,6 +3373,13 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, unsigned DstReg = MRI.createVirtualRegister(SRC); unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32; + if (SubRegs == 1) { + BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), + get(AMDGPU::V_READFIRSTLANE_B32), DstReg) + .addReg(SrcReg); + return DstReg; + } + SmallVector<unsigned, 8> SRegs; for (unsigned i = 0; i < SubRegs; ++i) { unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); @@ -3438,6 +3557,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { return; } + // Legalize SI_INIT_M0 + if (MI.getOpcode() == AMDGPU::SI_INIT_M0) { + MachineOperand &Src = MI.getOperand(0); + if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg()))) + Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); + return; + } + // Legalize MIMG and MUBUF/MTBUF for shaders. 
// // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via @@ -3539,8 +3666,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { } else { // This instructions is the _OFFSET variant, so we need to convert it to // ADDR64. - assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration() - < SISubtarget::VOLCANIC_ISLANDS && + assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration() + < AMDGPUSubtarget::VOLCANIC_ISLANDS && "FIXME: Need to emit flat atomics here"); MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); @@ -3676,37 +3803,37 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { continue; case AMDGPU::S_LSHL_B32: - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHLREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I32: - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_ASHRREV_I32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B32: - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHRREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHL_B64: - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHLREV_B64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I64: - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_ASHRREV_I64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B64: - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHRREV_B64; swapOperands(Inst); } @@ -3756,39 +3883,49 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // FIXME: This isn't safe because the addressing mode doesn't work // correctly if vaddr is negative. // - // FIXME: Handle v_add_u32 and VOP3 form. Also don't rely on immediate - // being in src0. - // // FIXME: Should probably be done somewhere else, maybe SIFoldOperands. // // See if we can extract an immediate offset by recognizing one of these: // V_ADD_I32_e32 dst, imm, src1 // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1 // V_ADD will be removed by "Remove dead machine instructions". - if (Add && Add->getOpcode() == AMDGPU::V_ADD_I32_e32) { - const MachineOperand *Src = - getNamedOperand(*Add, AMDGPU::OpName::src0); - - if (Src->isReg()) { - auto Mov = MRI.getUniqueVRegDef(Src->getReg()); - if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32) - Src = &Mov->getOperand(1); - } - - if (Src) { - if (Src->isImm()) - Offset = Src->getImm(); - else if (Src->isCImm()) - Offset = Src->getCImm()->getZExtValue(); - } + if (Add && + (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 || + Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) { + static const unsigned SrcNames[2] = { + AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + }; + + // Find a literal offset in one of source operands. 
+ for (int i = 0; i < 2; i++) { + const MachineOperand *Src = + getNamedOperand(*Add, SrcNames[i]); + + if (Src->isReg()) { + auto Mov = MRI.getUniqueVRegDef(Src->getReg()); + if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32) + Src = &Mov->getOperand(1); + } + + if (Src) { + if (Src->isImm()) + Offset = Src->getImm(); + else if (Src->isCImm()) + Offset = Src->getCImm()->getZExtValue(); + } + + if (Offset && isLegalMUBUFImmOffset(Offset)) { + VAddr = getNamedOperand(*Add, SrcNames[!i]); + break; + } - if (Offset && isLegalMUBUFImmOffset(Offset)) - VAddr = getNamedOperand(*Add, AMDGPU::OpName::src1); - else Offset = 0; + } } - BuildMI(*MBB, Inst, Inst.getDebugLoc(), + MachineInstr *NewInstr = + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst) .add(*VAddr) // vaddr .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc @@ -3797,12 +3934,17 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm()) .addImm(0) // slc .addImm(0) // tfe - .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end()); + .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end()) + .getInstr(); MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(), VDst); addUsersToMoveToVALUWorklist(VDst, MRI, Worklist); Inst.eraseFromParent(); + + // Legalize all operands other than the offset. Notably, convert the srsrc + // into SGPRs using v_readfirstlane if needed. + legalizeOperands(*NewInstr); continue; } } @@ -3884,6 +4026,13 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg()); MRI.clearKillFlags(Inst.getOperand(1).getReg()); Inst.getOperand(0).setReg(DstReg); + + // Make sure we don't leave around a dead VGPR->SGPR copy. Normally + // these are deleted later, but at -O0 it would leave a suspicious + // looking illegal copy of an undef register. + for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I) + Inst.RemoveOperand(I); + Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); continue; } @@ -3975,17 +4124,23 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); - unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor) - .add(Src0) - .add(Src1); + unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + if (ST.hasDLInsts()) { + BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest) + .add(Src0) + .add(Src1); + } else { + unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor) + .add(Src0) + .add(Src1); - unsigned Not = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), Not) - .addReg(Xor); + BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest) + .addReg(Xor); + } - MRI.replaceRegWith(Dest.getReg(), Not); - addUsersToMoveToVALUWorklist(Not, MRI, Worklist); + MRI.replaceRegWith(Dest.getReg(), NewDest); + addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); } void SIInstrInfo::splitScalar64BitUnaryOp( @@ -4478,12 +4633,12 @@ uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT; if (ST.isAmdHsaOS()) { // Set ATC = 1. GFX9 doesn't have this bit. 
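lowerScalarXnor above now emits a single V_XNOR_B32 when the subtarget has DL instructions and otherwise falls back to V_XOR_B32 followed by V_NOT_B32; both paths produce the same value. A scalar model of the two lowerings:

#include <cstdint>
#include <cstdio>

uint32_t xnorSingle(uint32_t a, uint32_t b) { return ~(a ^ b); }  // V_XNOR_B32
uint32_t xnorExpanded(uint32_t a, uint32_t b) {
  uint32_t x = a ^ b;                                             // V_XOR_B32
  return ~x;                                                      // V_NOT_B32
}

int main() {
  uint32_t a = 0xF0F0F0F0u, b = 0xFF00FF00u;
  std::printf("%08x %08x\n", xnorSingle(a, b), xnorExpanded(a, b)); // identical
}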
- if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) + if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) RsrcDataFormat |= (1ULL << 56); // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this. // BTW, it disables TC L2 and therefore decreases performance. - if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS) + if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) RsrcDataFormat |= (2ULL << 59); } @@ -4496,7 +4651,7 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { 0xffffffff; // Size; // GFX9 doesn't have ELEMENT_SIZE. - if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) { uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT; } @@ -4506,7 +4661,7 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. // Clear them unless we want a huge stride. - if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; return Rsrc23; @@ -4531,7 +4686,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, return AMDGPU::NoRegister; assert(!MI.memoperands_empty() && - (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS); + (*MI.memoperands_begin())->getAddrSpace() == ST.getAMDGPUAS().PRIVATE_ADDRESS); FrameIndex = Addr->getIndex(); return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); @@ -4598,12 +4753,12 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { if (DescSize != 0 && DescSize != 4) return DescSize; + if (isFixedSize(MI)) + return DescSize; + // 4-byte instructions may have a 32-bit literal encoded after them. Check // operands that coud ever be literals. if (isVALU(MI) || isSALU(MI)) { - if (isFixedSize(MI)) - return DescSize; - int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) return 4; // No operands. @@ -4650,7 +4805,7 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { return true; for (const MachineMemOperand *MMO : MI.memoperands()) { - if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS) + if (MMO->getAddrSpace() == ST.getAMDGPUAS().FLAT_ADDRESS) return true; } return false; @@ -4817,3 +4972,70 @@ const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) con llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO"); } } + +bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const { + if (!isSMRD(MI)) + return false; + + // Check that it is using a buffer resource. + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase); + if (Idx == -1) // e.g. 
s_memtime + return false; + + const auto RCID = MI.getDesc().OpInfo[Idx].RegClass; + return RCID == AMDGPU::SReg_128RegClassID; +} + +// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td +enum SIEncodingFamily { + SI = 0, + VI = 1, + SDWA = 2, + SDWA9 = 3, + GFX80 = 4, + GFX9 = 5 +}; + +static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { + switch (ST.getGeneration()) { + default: + break; + case AMDGPUSubtarget::SOUTHERN_ISLANDS: + case AMDGPUSubtarget::SEA_ISLANDS: + return SIEncodingFamily::SI; + case AMDGPUSubtarget::VOLCANIC_ISLANDS: + case AMDGPUSubtarget::GFX9: + return SIEncodingFamily::VI; + } + llvm_unreachable("Unknown subtarget generation!"); +} + +int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { + SIEncodingFamily Gen = subtargetEncodingFamily(ST); + + if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && + ST.getGeneration() >= AMDGPUSubtarget::GFX9) + Gen = SIEncodingFamily::GFX9; + + if (get(Opcode).TSFlags & SIInstrFlags::SDWA) + Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9 + : SIEncodingFamily::SDWA; + // Adjust the encoding family to GFX80 for D16 buffer instructions when the + // subtarget has UnpackedD16VMem feature. + // TODO: remove this when we discard GFX80 encoding. + if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf)) + Gen = SIEncodingFamily::GFX80; + + int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); + + // -1 means that Opcode is already a native instruction. + if (MCOp == -1) + return Opcode; + + // (uint16_t)-1 means that Opcode is a pseudo instruction that has + // no encoding in the given subtarget generation. + if (MCOp == (uint16_t)-1) + return -1; + + return MCOp; +} diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 24ee843e6ade..0a735257d34e 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Interface definition for SIInstrInfo. +/// Interface definition for SIInstrInfo. // //===----------------------------------------------------------------------===// @@ -31,20 +31,23 @@ #include <cassert> #include <cstdint> +#define GET_INSTRINFO_HEADER +#include "AMDGPUGenInstrInfo.inc" + namespace llvm { class APInt; class MachineRegisterInfo; class RegScavenger; -class SISubtarget; +class GCNSubtarget; class TargetRegisterClass; -class SIInstrInfo final : public AMDGPUInstrInfo { +class SIInstrInfo final : public AMDGPUGenInstrInfo { private: const SIRegisterInfo RI; - const SISubtarget &ST; + const GCNSubtarget &ST; - // The the inverse predicate should have the negative value. + // The inverse predicate should have the negative value. 
enum BranchPredicate { INVALID_BR = 0, SCC_TRUE = 1, @@ -144,7 +147,7 @@ public: MO_REL32_HI = 5 }; - explicit SIInstrInfo(const SISubtarget &ST); + explicit SIInstrInfo(const GCNSubtarget &ST); const SIRegisterInfo &getRegisterInfo() const { return RI; @@ -163,7 +166,10 @@ public: bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1, MachineInstr &SecondLdSt, unsigned BaseReg2, - unsigned NumLoads) const final; + unsigned NumLoads) const override; + + bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, + int64_t Offset1, unsigned NumLoads) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, @@ -203,7 +209,7 @@ public: bool expandPostRAPseudo(MachineInstr &MI) const override; - // \brief Returns an opcode that can be used to move a value to a \p DstRC + // Returns an opcode that can be used to move a value to a \p DstRC // register. If there is no hardware instruction that can store to \p // DstRC, then AMDGPU::COPY is returned. unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; @@ -419,18 +425,7 @@ public: return get(Opcode).TSFlags & SIInstrFlags::SMRD; } - bool isBufferSMRD(const MachineInstr &MI) const { - if (!isSMRD(MI)) - return false; - - // Check that it is using a buffer resource. - int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase); - if (Idx == -1) // e.g. s_memtime - return false; - - const auto RCID = MI.getDesc().OpInfo[Idx].RegClass; - return RCID == AMDGPU::SReg_128RegClassID; - } + bool isBufferSMRD(const MachineInstr &MI) const; static bool isDS(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::DS; @@ -674,16 +669,16 @@ public: bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const; - /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding. + /// Return true if this 64-bit VALU instruction has a 32-bit encoding. /// This function will return false if you pass it a 32-bit instruction. bool hasVALU32BitEncoding(unsigned Opcode) const; - /// \brief Returns true if this operand uses the constant bus. + /// Returns true if this operand uses the constant bus. bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const; - /// \brief Return true if this instruction has any modifiers. + /// Return true if this instruction has any modifiers. /// e.g. src[012]_mod, omod, clamp. bool hasModifiers(unsigned Opcode) const; @@ -696,7 +691,7 @@ public: unsigned getVALUOp(const MachineInstr &MI) const; - /// \brief Return the correct register class for \p OpNo. For target-specific + /// Return the correct register class for \p OpNo. For target-specific /// instructions, this will return the register class that has been defined /// in tablegen. For generic instructions, like REG_SEQUENCE it will return /// the register class of its machine operand. @@ -704,7 +699,7 @@ public: const TargetRegisterClass *getOpRegClass(const MachineInstr &MI, unsigned OpNo) const; - /// \brief Return the size in bytes of the operand OpNo on the given + /// Return the size in bytes of the operand OpNo on the given // instruction opcode. 
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const { const MCOperandInfo &OpInfo = get(Opcode).OpInfo[OpNo]; @@ -718,7 +713,7 @@ public: return RI.getRegSizeInBits(*RI.getRegClass(OpInfo.RegClass)) / 8; } - /// \brief This form should usually be preferred since it handles operands + /// This form should usually be preferred since it handles operands /// with unknown register classes. unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const { return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8; @@ -728,7 +723,7 @@ public: /// to read a VGPR. bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const; - /// \brief Legalize the \p OpIndex operand of this instruction by inserting + /// Legalize the \p OpIndex operand of this instruction by inserting /// a MOV. For example: /// ADD_I32_e32 VGPR0, 15 /// to @@ -739,29 +734,29 @@ public: /// instead of MOV. void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const; - /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand + /// Check if \p MO is a legal operand if it was the \p OpIdx Operand /// for \p MI. bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO = nullptr) const; - /// \brief Check if \p MO would be a valid operand for the given operand + /// Check if \p MO would be a valid operand for the given operand /// definition \p OpInfo. Note this does not attempt to validate constant bus /// restrictions (e.g. literal constant usage). bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const; - /// \brief Check if \p MO (a register operand) is a legal register for the + /// Check if \p MO (a register operand) is a legal register for the /// given operand description. bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const; - /// \brief Legalize operands in \p MI by either commuting it or inserting a + /// Legalize operands in \p MI by either commuting it or inserting a /// copy of src1. void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const; - /// \brief Fix operands in \p MI to satisfy constant bus requirements. + /// Fix operands in \p MI to satisfy constant bus requirements. void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const; /// Copy a value from a VGPR (\p SrcReg) to SGPR. This function can only @@ -779,11 +774,11 @@ public: MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const; - /// \brief Legalize all operands in this instruction. This function may + /// Legalize all operands in this instruction. This function may /// create new instruction and insert them before \p MI. void legalizeOperands(MachineInstr &MI) const; - /// \brief Replace this instruction's opcode with the equivalent VALU + /// Replace this instruction's opcode with the equivalent VALU /// opcode. This function will also move the users of \p MI to the /// VALU if necessary. void moveToVALU(MachineInstr &MI) const; @@ -795,11 +790,11 @@ public: MachineBasicBlock::iterator MI) const override; void insertReturn(MachineBasicBlock &MBB) const; - /// \brief Return the number of wait states that result from executing this + /// Return the number of wait states that result from executing this /// instruction. unsigned getNumWaitStates(const MachineInstr &MI) const; - /// \brief Returns the operand named \p Op. If \p MI does not have an + /// Returns the operand named \p Op. 
If \p MI does not have an /// operand named \c Op, this function returns nullptr. LLVM_READONLY MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const; @@ -822,7 +817,7 @@ public: bool isLowLatencyInstruction(const MachineInstr &MI) const; bool isHighLatencyInstruction(const MachineInstr &MI) const; - /// \brief Return the descriptor of the target-specific machine instruction + /// Return the descriptor of the target-specific machine instruction /// that corresponds to the specified pseudo or native opcode. const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const { return get(pseudoToMCOpcode(Opcode)); @@ -867,7 +862,7 @@ public: bool isBasicBlockPrologue(const MachineInstr &MI) const override; - /// \brief Return a partially built integer add instruction without carry. + /// Return a partially built integer add instruction without carry. /// Caller must add source operands. /// For pre-GFX9 it will generate unused carry destination operand. /// TODO: After GFX9 it should return a no-carry operation. @@ -882,6 +877,12 @@ public: static bool isLegalMUBUFImmOffset(unsigned Imm) { return isUInt<12>(Imm); } + + /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. + /// Return -1 if the target-specific opcode for the pseudo instruction does + /// not exist. If Opcode is not a pseudo instruction, this is identity. + int pseudoToMCOpcode(int Opcode) const; + }; namespace AMDGPU { @@ -908,6 +909,9 @@ namespace AMDGPU { int getAddr64Inst(uint16_t Opcode); LLVM_READONLY + int getMUBUFNoLdsInst(uint16_t Opcode); + + LLVM_READONLY int getAtomicRetOp(uint16_t Opcode); LLVM_READONLY diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index fc2d35d873aa..8fa37aa83dae 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -7,16 +7,21 @@ // //===----------------------------------------------------------------------===// def isCI : Predicate<"Subtarget->getGeneration() " - ">= SISubtarget::SEA_ISLANDS">; + ">= AMDGPUSubtarget::SEA_ISLANDS">; def isCIOnly : Predicate<"Subtarget->getGeneration() ==" - "SISubtarget::SEA_ISLANDS">, + "AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate <"FeatureSeaIslands">; def isVIOnly : Predicate<"Subtarget->getGeneration() ==" - "SISubtarget::VOLCANIC_ISLANDS">, + "AMDGPUSubtarget::VOLCANIC_ISLANDS">, AssemblerPredicate <"FeatureVolcanicIslands">; def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; +class GCNPredicateControl : PredicateControl { + Predicate SIAssemblerPredicate = isSICI; + Predicate VIAssemblerPredicate = isVI; +} + // Execpt for the NONE field, this must be kept in sync with the // SIEncodingFamily enum in AMDGPUInstrInfo.cpp def SIEncodingFamily { @@ -25,13 +30,16 @@ def SIEncodingFamily { int VI = 1; int SDWA = 2; int SDWA9 = 3; - int GFX9 = 4; + int GFX80 = 4; + int GFX9 = 5; } //===----------------------------------------------------------------------===// // SI DAG Nodes //===----------------------------------------------------------------------===// +def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; + def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT", SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>, [SDNPMayLoad, SDNPMemOperand] @@ -45,22 +53,41 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; -def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", - SDTypeProfile<1, 9, - [ // vdata 
- SDTCisVT<1, v4i32>, // rsrc - SDTCisVT<2, i32>, // vindex(VGPR) - SDTCisVT<3, i32>, // voffset(VGPR) - SDTCisVT<4, i32>, // soffset(SGPR) - SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // dfmt(imm) - SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // glc(imm) - SDTCisVT<9, i32> // slc(imm) - ]>, - [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +def SDTAtomic2_f32 : SDTypeProfile<1, 2, [ + SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1> +]>; + +def SIatomic_fadd : SDNode<"AMDGPUISD::ATOMIC_LOAD_FADD", SDTAtomic2_f32, + [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] +>; + +def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32, + [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; +def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32, + [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] +>; + +def SDTbuffer_load : SDTypeProfile<1, 9, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // offset(imm) + SDTCisVT<6, i32>, // dfmt(imm) + SDTCisVT<7, i32>, // nfmt(imm) + SDTCisVT<8, i32>, // glc(imm) + SDTCisVT<9, i32> // slc(imm) + ]>; + +def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTbuffer_load, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; +def SItbuffer_load_d16 : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT_D16", + SDTbuffer_load, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; + def SDTtbuffer_store : SDTypeProfile<0, 10, [ // vdata SDTCisVT<1, v4i32>, // rsrc @@ -79,6 +106,9 @@ def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3", SDTtbuffer_store, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16", + SDTtbuffer_store, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; def SDTBufferLoad : SDTypeProfile<1, 5, [ // vdata @@ -92,6 +122,9 @@ def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16", + SDTBufferLoad, + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; def SDTBufferStore : SDTypeProfile<0, 6, [ // vdata @@ -102,9 +135,13 @@ def SDTBufferStore : SDTypeProfile<0, 6, SDTCisVT<5, i1>]>; // slc def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, - [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>; -def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", SDTBufferStore, - [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>; + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", + SDTBufferStore, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; +def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16", + SDTBufferStore, + [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; class SDBufferAtomic<string opcode> : SDNode <opcode, SDTypeProfile<1, 5, @@ -140,21 +177,41 @@ def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; -class SDSample<string opcode> : SDNode <opcode, - SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>, - SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> ->; - -def 
SIsample : SDSample<"AMDGPUISD::SAMPLE">; -def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">; -def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; -def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; - def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET", SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]> >; //===----------------------------------------------------------------------===// +// ValueType helpers +//===----------------------------------------------------------------------===// + +// Returns 1 if the source arguments have modifiers, 0 if they do not. +// XXX - do f16 instructions? +class isFloatType<ValueType SrcVT> { + bit ret = + !if(!eq(SrcVT.Value, f16.Value), 1, + !if(!eq(SrcVT.Value, f32.Value), 1, + !if(!eq(SrcVT.Value, f64.Value), 1, + !if(!eq(SrcVT.Value, v2f16.Value), 1, + 0)))); +} + +class isIntType<ValueType SrcVT> { + bit ret = + !if(!eq(SrcVT.Value, i16.Value), 1, + !if(!eq(SrcVT.Value, i32.Value), 1, + !if(!eq(SrcVT.Value, i64.Value), 1, + 0))); +} + +class isPackedType<ValueType SrcVT> { + bit ret = + !if(!eq(SrcVT.Value, v2i16.Value), 1, + !if(!eq(SrcVT.Value, v2f16.Value), 1, 0) + ); +} + +//===----------------------------------------------------------------------===// // PatFrags for global memory operations //===----------------------------------------------------------------------===// @@ -163,6 +220,9 @@ defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>; def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>; def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>; +def atomic_load_fadd_local : local_binary_atomic_op<SIatomic_fadd>; +def atomic_load_fmin_local : local_binary_atomic_op<SIatomic_fmin>; +def atomic_load_fmax_local : local_binary_atomic_op<SIatomic_fmax>; //===----------------------------------------------------------------------===// // SDNodes PatFrags for loads/stores with a glue input. 
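The isFloatType / isIntType / isPackedType helpers added above are nested TableGen !if chains over ValueType; the same classification rendered in C++ for readability (the enum is illustrative):

#include <initializer_list>

enum class VT { f16, f32, f64, v2f16, i16, i32, i64, v2i16 };

bool isFloatType(VT T) {
  for (VT F : {VT::f16, VT::f32, VT::f64, VT::v2f16})
    if (T == F) return true;
  return false;
}
bool isIntType(VT T) {
  for (VT I : {VT::i16, VT::i32, VT::i64})
    if (T == I) return true;
  return false;
}
bool isPackedType(VT T) { return T == VT::v2i16 || T == VT::v2f16; }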
@@ -178,6 +238,10 @@ def AMDGPUld_glue : SDNode <"ISD::LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] >; +def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] +>; + def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr), [{ return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED; }]>; @@ -186,6 +250,18 @@ def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr), [{ return cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; }]>; +def atomic_load_32_glue : PatFrag<(ops node:$ptr), + (AMDGPUatomic_ld_glue node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i32; +} + +def atomic_load_64_glue : PatFrag<(ops node:$ptr), + (AMDGPUatomic_ld_glue node:$ptr)> { + let IsAtomic = 1; + let MemoryVT = i64; +} + def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr), [{ return cast<LoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD; }]>; @@ -219,6 +295,9 @@ def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{ def load_glue_align8 : Aligned8Bytes < (ops node:$ptr), (load_glue node:$ptr) >; +def load_glue_align16 : Aligned16Bytes < + (ops node:$ptr), (load_glue node:$ptr) +>; def load_local_m0 : LoadFrag<load_glue>, LocalAddress; @@ -227,12 +306,23 @@ def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress; def az_extloadi8_local_m0 : LoadFrag<az_extloadi8_glue>, LocalAddress; def az_extloadi16_local_m0 : LoadFrag<az_extloadi16_glue>, LocalAddress; def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress; +def load_align16_local_m0 : LoadFrag <load_glue_align16>, LocalAddress; +def atomic_load_32_local_m0 : LoadFrag<atomic_load_32_glue>, LocalAddress; +def atomic_load_64_local_m0 : LoadFrag<atomic_load_64_glue>, LocalAddress; def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] >; +def AMDGPUatomic_st_glue : SDNode <"ISD::ATOMIC_STORE", SDTAtomicStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] +>; + +def atomic_store_glue : PatFrag<(ops node:$ptr, node:$val), + (AMDGPUatomic_st_glue node:$ptr, node:$val)> { +} + def unindexedstore_glue : PatFrag<(ops node:$val, node:$ptr), (AMDGPUst_glue node:$val, node:$ptr), [{ return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED; @@ -262,11 +352,17 @@ def store_glue_align8 : Aligned8Bytes < (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr) >; +def store_glue_align16 : Aligned16Bytes < + (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr) +>; + def store_local_m0 : StoreFrag<store_glue>, LocalAddress; def truncstorei8_local_m0 : StoreFrag<truncstorei8_glue>, LocalAddress; def truncstorei16_local_m0 : StoreFrag<truncstorei16_glue>, LocalAddress; +def atomic_store_local_m0 : StoreFrag<AMDGPUatomic_st_glue>, LocalAddress; def store_align8_local_m0 : StoreFrag<store_glue_align8>, LocalAddress; +def store_align16_local_m0 : StoreFrag<store_glue_align16>, LocalAddress; def si_setcc_uniform : PatFrag < (ops node:$lhs, node:$rhs, node:$cond), @@ -297,10 +393,11 @@ def lshl_rev : PatFrag < (shl $src0, $src1) >; -multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> { +multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0, + SDTypeProfile tc = SDTAtomic2> { def _glue : SDNode < - !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, SDTAtomic2, + !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, tc, [SDNPHasChain, SDNPMayStore, 
SDNPMayLoad, SDNPMemOperand, SDNPInGlue] >; @@ -319,6 +416,9 @@ defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; defm atomic_swap : SIAtomicM0Glue2 <"SWAP">; +defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 1, SDTAtomic2_f32>; +defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>; +defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>; def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] @@ -368,6 +468,12 @@ return CurDAG->getTargetConstant( N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64); }]>; +class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{ + uint64_t Imm = N->getZExtValue(); + unsigned Bit = (Imm >> }] # bitnum # [{ ) & 1; + return CurDAG->getTargetConstant(Bit, SDLoc(N), MVT::i1); +}]>; + def SIMM16bit : PatLeaf <(imm), [{return isInt<16>(N->getSExtValue());}] >; @@ -381,7 +487,7 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{ }]>; class VGPRImm <dag frag> : PatLeaf<frag, [{ - if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { return false; } const SIRegisterInfo *SIRI = @@ -552,19 +658,18 @@ def ExpSrc3 : RegisterOperand<VGPR_32> { let ParserMatchClass = VReg32OrOffClass; } -class SDWASrc : RegisterOperand<VS_32> { +class SDWASrc<ValueType vt> : RegisterOperand<VS_32> { let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_SDWA_SRC"; + string Type = !if(isFloatType<vt>.ret, "FP", "INT"); + let OperandType = "OPERAND_REG_INLINE_C_"#Type#vt.Size; + let DecoderMethod = "decodeSDWASrc"#vt.Size; let EncoderMethod = "getSDWASrcEncoding"; } -def SDWASrc32 : SDWASrc { - let DecoderMethod = "decodeSDWASrc32"; -} - -def SDWASrc16 : SDWASrc { - let DecoderMethod = "decodeSDWASrc16"; -} +def SDWASrc_i32 : SDWASrc<i32>; +def SDWASrc_i16 : SDWASrc<i16>; +def SDWASrc_f32 : SDWASrc<f32>; +def SDWASrc_f16 : SDWASrc<f16>; def SDWAVopcDst : VOPDstOperand<SReg_64> { let OperandNamespace = "AMDGPU"; @@ -637,19 +742,20 @@ def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>; def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>; def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; -def slc : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; -def tfe : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; -def unorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; -def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>; -def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>; -def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>; +def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; +def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; +def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; +def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>; +def R128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>; +def D16 : NamedOperandBit<"D16", NamedMatchClass<"D16">>; +def LWE : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>; def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>; def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>; -def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; +def DMask : 
NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>; @@ -747,16 +853,23 @@ class OpSelModsMatchClass : AsmOperandClass { def IntOpSelModsMatchClass : OpSelModsMatchClass; def IntOpSelMods : InputMods<IntOpSelModsMatchClass>; -def FPRegSDWAInputModsMatchClass : AsmOperandClass { - let Name = "SDWARegWithFPInputMods"; - let ParserMethod = "parseRegWithFPInputMods"; - let PredicateMethod = "isSDWARegKind"; +class FPSDWAInputModsMatchClass <int opSize> : AsmOperandClass { + let Name = "SDWAWithFP"#opSize#"InputMods"; + let ParserMethod = "parseRegOrImmWithFPInputMods"; + let PredicateMethod = "isSDWAFP"#opSize#"Operand"; } -def FPRegSDWAInputMods : InputMods <FPRegSDWAInputModsMatchClass> { +def FP16SDWAInputModsMatchClass : FPSDWAInputModsMatchClass<16>; +def FP32SDWAInputModsMatchClass : FPSDWAInputModsMatchClass<32>; + +class FPSDWAInputMods <FPSDWAInputModsMatchClass matchClass> : + InputMods <matchClass> { let PrintMethod = "printOperandAndFPInputMods"; } +def FP16SDWAInputMods : FPSDWAInputMods<FP16SDWAInputModsMatchClass>; +def FP32SDWAInputMods : FPSDWAInputMods<FP32SDWAInputModsMatchClass>; + def FPVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; @@ -767,17 +880,23 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> { let PrintMethod = "printOperandAndFPInputMods"; } - -def IntRegSDWAInputModsMatchClass : AsmOperandClass { - let Name = "SDWARegWithIntInputMods"; - let ParserMethod = "parseRegWithIntInputMods"; - let PredicateMethod = "isSDWARegKind"; +class IntSDWAInputModsMatchClass <int opSize> : AsmOperandClass { + let Name = "SDWAWithInt"#opSize#"InputMods"; + let ParserMethod = "parseRegOrImmWithIntInputMods"; + let PredicateMethod = "isSDWAInt"#opSize#"Operand"; } -def IntRegSDWAInputMods : InputMods <IntRegSDWAInputModsMatchClass> { +def Int16SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<16>; +def Int32SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<32>; + +class IntSDWAInputMods <IntSDWAInputModsMatchClass matchClass> : + InputMods <matchClass> { let PrintMethod = "printOperandAndIntInputMods"; } +def Int16SDWAInputMods : IntSDWAInputMods<Int16SDWAInputModsMatchClass>; +def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>; + def IntVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithIntInputMods"; let ParserMethod = "parseRegWithIntInputMods"; @@ -1023,7 +1142,12 @@ class getVregSrcForVT<ValueType VT> { } class getSDWASrcForVT <ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 16), SDWASrc16, SDWASrc32); + bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + 0)); + RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32); + RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32); + RegisterOperand ret = !if(isFP, retFlt, retInt); } // Returns the register class to use for sources of VOP3 instructions for the @@ -1064,32 +1188,6 @@ class getVOP3SrcForVT<ValueType VT> { ); } -// Returns 1 if the source arguments have modifiers, 0 if they do not. -// XXX - do f16 instructions? 
-class isFloatType<ValueType SrcVT> { - bit ret = - !if(!eq(SrcVT.Value, f16.Value), 1, - !if(!eq(SrcVT.Value, f32.Value), 1, - !if(!eq(SrcVT.Value, f64.Value), 1, - !if(!eq(SrcVT.Value, v2f16.Value), 1, - 0)))); -} - -class isIntType<ValueType SrcVT> { - bit ret = - !if(!eq(SrcVT.Value, i16.Value), 1, - !if(!eq(SrcVT.Value, i32.Value), 1, - !if(!eq(SrcVT.Value, i64.Value), 1, - 0))); -} - -class isPackedType<ValueType SrcVT> { - bit ret = - !if(!eq(SrcVT.Value, v2i16.Value), 1, - !if(!eq(SrcVT.Value, v2f16.Value), 1, 0) - ); -} - // Float or packed int class isModifierType<ValueType SrcVT> { bit ret = @@ -1134,11 +1232,10 @@ class getSrcModExt <ValueType VT> { // Return type of input modifiers operand specified input operand for SDWA class getSrcModSDWA <ValueType VT> { - bit isFP = !if(!eq(VT.Value, f16.Value), 1, - !if(!eq(VT.Value, f32.Value), 1, - !if(!eq(VT.Value, f64.Value), 1, - 0))); - Operand ret = !if(isFP, FPRegSDWAInputMods, IntRegSDWAInputMods); + Operand ret = !if(!eq(VT.Value, f16.Value), FP16SDWAInputMods, + !if(!eq(VT.Value, f32.Value), FP32SDWAInputMods, + !if(!eq(VT.Value, i16.Value), Int16SDWAInputMods, + Int32SDWAInputMods))); } // Returns the input arguments for VOP[12C] instructions for the given SrcVT. @@ -1733,6 +1830,9 @@ def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>; def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>; def VOP_V4I32_I64_I32_V4I32 : VOPProfile <[v4i32, i64, i32, v4i32]>; +def VOP_F32_V2F16_V2F16_F32 : VOPProfile <[f32, v2f16, v2f16, f32]>; +def VOP_I32_V2I16_V2I16_I32 : VOPProfile <[i32, v2i16, v2i16, i32]>; + class Commutable_REV <string revOp, bit isOrig> { string RevOp = revOp; bit IsOrig = isOrig; @@ -1747,6 +1847,8 @@ class AtomicNoRet <string noRetOp, bit isRet> { // Interpolation opcodes //===----------------------------------------------------------------------===// +class VINTRPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVINTRPDst">; + class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : VINTRPCommon <outs, ins, "", pattern>, SIMCInstr<opName, SIEncodingFamily.NONE> { @@ -1823,38 +1925,6 @@ def getBasicFromSDWAOp : InstrMapping { let ValueCols = [["Default"]]; } -def getMaskedMIMGOp1 : InstrMapping { - let FilterClass = "MIMG_Mask"; - let RowFields = ["Op"]; - let ColFields = ["Channels"]; - let KeyCol = ["1"]; - let ValueCols = [["2"], ["3"], ["4"] ]; -} - -def getMaskedMIMGOp2 : InstrMapping { - let FilterClass = "MIMG_Mask"; - let RowFields = ["Op"]; - let ColFields = ["Channels"]; - let KeyCol = ["2"]; - let ValueCols = [["1"], ["3"], ["4"] ]; -} - -def getMaskedMIMGOp3 : InstrMapping { - let FilterClass = "MIMG_Mask"; - let RowFields = ["Op"]; - let ColFields = ["Channels"]; - let KeyCol = ["3"]; - let ValueCols = [["1"], ["2"], ["4"] ]; -} - -def getMaskedMIMGOp4 : InstrMapping { - let FilterClass = "MIMG_Mask"; - let RowFields = ["Op"]; - let ColFields = ["Channels"]; - let KeyCol = ["4"]; - let ValueCols = [["1"], ["2"], ["3"] ]; -} - // Maps an commuted opcode to its original version def getCommuteOrig : InstrMapping { let FilterClass = "Commutable_REV"; @@ -1882,6 +1952,11 @@ def getMCOpcodeGen : InstrMapping { [!cast<string>(SIEncodingFamily.VI)], [!cast<string>(SIEncodingFamily.SDWA)], [!cast<string>(SIEncodingFamily.SDWA9)], + // GFX80 encoding is added to work around a multiple matching + // issue for buffer instructions with unpacked d16 data. This + // does not actually change the encoding, and thus may be + // removed later. 
+ [!cast<string>(SIEncodingFamily.GFX80)], [!cast<string>(SIEncodingFamily.GFX9)]]; } @@ -1902,6 +1977,14 @@ def getAddr64Inst : InstrMapping { let ValueCols = [["1"]]; } +def getMUBUFNoLdsInst : InstrMapping { + let FilterClass = "MUBUFLdsTable"; + let RowFields = ["OpName"]; + let ColFields = ["IsLds"]; + let KeyCol = ["1"]; + let ValueCols = [["0"]]; +} + // Maps an atomic opcode to its version with a return value. def getAtomicRetOp : InstrMapping { let FilterClass = "AtomicNoRet"; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 9740a18b7248..c3f8bfb53ef4 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -11,18 +11,10 @@ // that are not yet supported remain commented out. //===----------------------------------------------------------------------===// -def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; -def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; -def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, - AssemblerPredicate<"FeatureVGPRIndexMode">; -def HasMovrel : Predicate<"Subtarget->hasMovrel()">, - AssemblerPredicate<"FeatureMovrel">; - -class GCNPat<dag pattern, dag result> : AMDGPUPat<pattern, result> { +class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl { let SubtargetPredicate = isGCN; } - include "VOPInstructions.td" include "SOPInstructions.td" include "SMInstructions.td" @@ -40,15 +32,18 @@ defm EXP_DONE : EXP_m<1, AMDGPUexport_done>; // VINTRP Instructions //===----------------------------------------------------------------------===// +// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI) +def VINTRPDst : VINTRPDstOperand <VGPR_32>; + let Uses = [M0, EXEC] in { // FIXME: Specify SchedRW for VINTRP insturctions. 
multiclass V_INTERP_P1_F32_m : VINTRP_m < 0x00000000, - (outs VGPR_32:$vdst), + (outs VINTRPDst:$vdst), (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), - "v_interp_p1_f32 $vdst, $vsrc, $attr$attrchan", + "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan", [(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan), (i32 imm:$attr)))] >; @@ -69,9 +64,9 @@ let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in { defm V_INTERP_P2_F32 : VINTRP_m < 0x00000001, - (outs VGPR_32:$vdst), + (outs VINTRPDst:$vdst), (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), - "v_interp_p2_f32 $vdst, $vsrc, $attr$attrchan", + "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan", [(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan), (i32 imm:$attr)))]>; @@ -79,9 +74,9 @@ defm V_INTERP_P2_F32 : VINTRP_m < defm V_INTERP_MOV_F32 : VINTRP_m < 0x00000002, - (outs VGPR_32:$vdst), + (outs VINTRPDst:$vdst), (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan), - "v_interp_mov_f32 $vdst, $vsrc, $attr$attrchan", + "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan", [(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan), (i32 imm:$attr)))]>; @@ -186,6 +181,7 @@ def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst), let SALU = 1; let isAsCheapAsAMove = 1; let isTerminator = 1; + let Defs = [SCC]; } def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst), @@ -246,7 +242,6 @@ def SI_IF: CFPseudoInstSI < def SI_ELSE : CFPseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> { - let Constraints = "$src = $dst"; let Size = 12; let hasSideEffects = 1; } @@ -296,14 +291,21 @@ def SI_ELSE_BREAK : CFPseudoInstSI < let isReMaterializable = 1; } -let Uses = [EXEC], Defs = [EXEC,VCC] in { +let Uses = [EXEC] in { multiclass PseudoInstKill <dag ins> { + // Even though this pseudo can usually be expanded without an SCC def, we + // conservatively assume that it has an SCC def, both because it is sometimes + // required in degenerate cases (when V_CMPX cannot be used due to constant + // bus limitations) and because it allows us to avoid having to track SCC + // liveness across basic blocks. + let Defs = [EXEC,VCC,SCC] in def _PSEUDO : PseudoInstSI <(outs), ins> { let isConvergent = 1; let usesCustomInserter = 1; } + let Defs = [EXEC,VCC,SCC] in def _TERMINATOR : SPseudoInstSI <(outs), ins> { let isTerminator = 1; } @@ -312,6 +314,7 @@ multiclass PseudoInstKill <dag ins> { defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>; defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>; +let Defs = [EXEC,VCC] in def SI_ILLEGAL_COPY : SPseudoInstSI < (outs unknown:$dst), (ins unknown:$src), [], " ; illegal copy $src to $dst">; @@ -371,6 +374,7 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI < let isReturn = 1; let hasNoSchedulingInfo = 1; let DisableWQM = 1; + let FixedSize = 1; } // Return for returning function calls. 
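The VINTRPDst operand introduced above exists purely to route assembly printing through a custom method so that VI output carries an explicit "_e32" suffix (the asm strings drop the space before $vdst and let the printer supply it). The snippet below is a simplified sketch of what such a PrintMethod hook can look like; it assumes the surrounding AMDGPUInstPrinter class with its usual isSI/isCI/printOperand helpers and is not quoted from this commit:

    // Sketch (assumed, simplified): the PrintMethod named by the RegisterOperand
    // ("printVINTRPDst") emits the encoding suffix before the destination
    // register, so SI/CI keep the plain mnemonic while VI gets "_e32".
    void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo,
                                           const MCSubtargetInfo &STI,
                                           raw_ostream &O) {
      if (AMDGPU::isSI(STI) || AMDGPU::isCI(STI))
        O << " ";      // e.g. "v_interp_p1_f32 v0, v1, attr0.x"
      else
        O << "_e32 ";  // VI: distinguish from the new _e64 variants
      printOperand(MI, OpNo, STI, O);
    }
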
@@ -449,7 +453,7 @@ def ADJCALLSTACKDOWN : SPseudoInstSI< let usesCustomInserter = 1; } -let Defs = [M0, EXEC], +let Defs = [M0, EXEC, SCC], UseNamedOperandTable = 1 in { class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI < @@ -569,11 +573,6 @@ def : GCNPat< (SI_ELSE $src, $target, 0) >; -def : GCNPat < - (int_AMDGPU_kilp), - (SI_KILL_I1_PSEUDO (i1 0), 0) ->; - def : Pat < // -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0) (AMDGPUkill (i32 -1082130432)), @@ -643,6 +642,11 @@ def : GCNPat < >; def : GCNPat < + (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))), + (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)), DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +def : GCNPat < (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))), (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; @@ -700,15 +704,19 @@ multiclass FMADPat <ValueType vt, Instruction inst> { defm : FMADPat <f16, V_MAC_F16_e64>; defm : FMADPat <f32, V_MAC_F32_e64>; -class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : GCNPat< - (f32 (mad_opr (VOP3Mods f32:$src0, i32:$src0_mod), - (VOP3Mods f32:$src1, i32:$src1_mod), - (VOP3Mods f32:$src2, i32:$src2_mod))), +class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty> + : GCNPat< + (Ty (mad_opr (VOP3Mods Ty:$src0, i32:$src0_mod), + (VOP3Mods Ty:$src1, i32:$src1_mod), + (VOP3Mods Ty:$src2, i32:$src2_mod))), (inst $src0_mod, $src0, $src1_mod, $src1, $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) >; -def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz>; +def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>; +def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> { + let SubtargetPredicate = Has16BitInsts; +} multiclass SelectPat <ValueType vt, Instruction inst> { def : GCNPat < @@ -726,6 +734,10 @@ def : GCNPat < (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), (V_BCNT_U32_B32_e64 $popcnt, $val) >; +def : GCNPat < + (i16 (add (i16 (trunc (ctpop i32:$popcnt))), i16:$val)), + (V_BCNT_U32_B32_e64 $popcnt, $val) +>; /********** ============================================ **********/ /********** Extraction, Insertion, Building and Casting **********/ @@ -795,6 +807,27 @@ foreach Index = 0-15 in { >; } + +def : Pat < + (extract_subvector v4i16:$vec, (i32 0)), + (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0)) +>; + +def : Pat < + (extract_subvector v4i16:$vec, (i32 2)), + (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1)) +>; + +def : Pat < + (extract_subvector v4f16:$vec, (i32 0)), + (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0)) +>; + +def : Pat < + (extract_subvector v4f16:$vec, (i32 2)), + (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1)) +>; + let SubtargetPredicate = isGCN in { // FIXME: Why do only some of these type combinations for SReg and @@ -834,6 +867,26 @@ def : BitConvert <f64, v2f32, VReg_64>; def : BitConvert <v2f32, f64, VReg_64>; def : BitConvert <f64, v2i32, VReg_64>; def : BitConvert <v2i32, f64, VReg_64>; + +// FIXME: Make SGPR +def : BitConvert <v2i32, v4f16, VReg_64>; +def : BitConvert <v4f16, v2i32, VReg_64>; +def : BitConvert <v2i32, v4f16, VReg_64>; +def : BitConvert <v2i32, v4i16, VReg_64>; +def : BitConvert <v4i16, v2i32, VReg_64>; +def : BitConvert <v2f32, v4f16, VReg_64>; +def : BitConvert <v4f16, v2f32, VReg_64>; +def : BitConvert <v2f32, v4i16, VReg_64>; +def : BitConvert <v4i16, v2f32, VReg_64>; +def : BitConvert <v4i16, f64, VReg_64>; +def : BitConvert <v4f16, f64, VReg_64>; +def : BitConvert <f64, v4i16, VReg_64>; +def : BitConvert <f64, v4f16, VReg_64>; +def : BitConvert 
<v4i16, i64, VReg_64>; +def : BitConvert <v4f16, i64, VReg_64>; +def : BitConvert <i64, v4i16, VReg_64>; +def : BitConvert <i64, v4f16, VReg_64>; + def : BitConvert <v4i32, v4f32, VReg_128>; def : BitConvert <v4f32, v4i32, VReg_128>; @@ -876,11 +929,13 @@ def : ClampPat<V_MAX_F32_e64, f32>; def : ClampPat<V_MAX_F64, f64>; def : ClampPat<V_MAX_F16_e64, f16>; +let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))), (V_PK_MAX_F16 $src0_modifiers, $src0, $src0_modifiers, $src0, DSTCLAMP.ENABLE) >; +} /********** ================================ **********/ /********** Floating point absolute/negative **********/ @@ -906,7 +961,7 @@ def : GCNPat < def : GCNPat < (fabs f32:$src), - (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff))) + (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fffffff))) >; def : GCNPat < @@ -967,12 +1022,12 @@ def : GCNPat < def : GCNPat < (fneg f16:$src), - (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000))) + (S_XOR_B32 $src, (S_MOV_B32 (i32 0x00008000))) >; def : GCNPat < (fabs f16:$src), - (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff))) + (S_AND_B32 $src, (S_MOV_B32 (i32 0x00007fff))) >; def : GCNPat < @@ -982,12 +1037,12 @@ def : GCNPat < def : GCNPat < (fneg v2f16:$src), - (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src) + (S_XOR_B32 $src, (S_MOV_B32 (i32 0x80008000))) >; def : GCNPat < (fabs v2f16:$src), - (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src) + (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fff7fff))) >; // This is really (fneg (fabs v2f16:$src)) @@ -996,7 +1051,12 @@ def : GCNPat < // VOP3P instructions, so it is turned into the bit op. def : GCNPat < (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))), - (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit + (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit +>; + +def : GCNPat < + (fneg (v2f16 (fabs v2f16:$src))), + (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit >; /********** ================== **********/ @@ -1097,6 +1157,7 @@ let SubtargetPredicate = isGCN in { def : IMad24Pat<V_MAD_I32_I24, 1>; def : UMad24Pat<V_MAD_U32_U24, 1>; +// FIXME: This should only be done for VALU inputs defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; def : ROTRPattern <V_ALIGNBIT_B32>; @@ -1337,11 +1398,13 @@ def : GCNPat< (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0) >; +let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat< (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE) >; } +} let OtherPredicates = [NoFP32Denormals] in { def : GCNPat< @@ -1371,6 +1434,16 @@ def : GCNPat< >; } +let OtherPredicates = [HasDLInsts] in { +def : GCNPat < + (fma (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)), + (f32 (VOP3NoMods f32:$src2))), + (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, + SRCMODS.NONE, $src2, $clamp, $omod) +>; +} // End OtherPredicates = [HasDLInsts] + // Allow integer inputs class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat< @@ -1381,11 +1454,6 @@ class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPa def : ExpPattern<AMDGPUexport, i32, EXP>; def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>; -def : GCNPat < - (v2i16 (build_vector i16:$src0, i16:$src1)), - (v2i16 (S_PACK_LL_B32_B16 $src0, $src1)) ->; - // COPY_TO_REGCLASS is workaround tablegen bug from multiple outputs 
// from S_LSHL_B32's multiple outputs from implicit scc def. def : GCNPat < @@ -1393,6 +1461,13 @@ def : GCNPat < (v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0)) >; + +let SubtargetPredicate = HasVOP3PInsts in { +def : GCNPat < + (v2i16 (build_vector i16:$src0, i16:$src1)), + (v2i16 (S_PACK_LL_B32_B16 $src0, $src1)) +>; + // With multiple uses of the shift, this will duplicate the shift and // increase register pressure. def : GCNPat < @@ -1400,6 +1475,7 @@ def : GCNPat < (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1)) >; + def : GCNPat < (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))), (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))), @@ -1412,6 +1488,9 @@ def : GCNPat < (v2f16 (S_PACK_LL_B32_B16 $src0, $src1)) >; +} // End SubtargetPredicate = HasVOP3PInsts + + // def : GCNPat < // (v2f16 (scalar_to_vector f16:$src0)), // (COPY $src0) @@ -1422,6 +1501,16 @@ def : GCNPat < // (COPY $src0) // >; +def : GCNPat < + (v4i16 (scalar_to_vector i16:$src0)), + (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) +>; + +def : GCNPat < + (v4f16 (scalar_to_vector f16:$src0)), + (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) +>; + //===----------------------------------------------------------------------===// // Fract Patterns //===----------------------------------------------------------------------===// @@ -1486,7 +1575,7 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; // FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>; defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>; -def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>; +defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>; def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>; def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>; diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 84cd47a101a8..4b537540046f 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -45,6 +45,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" @@ -102,7 +103,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass { }; private: - const SISubtarget *STM = nullptr; + const GCNSubtarget *STM = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; MachineRegisterInfo *MRI = nullptr; @@ -137,7 +138,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; - StringRef getPassName() const override { return "SI Load / Store Optimizer"; } + StringRef getPassName() const override { return "SI Load Store Optimizer"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -150,10 +151,10 @@ public: } // end anonymous namespace. INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, - "SI Load / Store Optimizer", false, false) + "SI Load Store Optimizer", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, - "SI Load / Store Optimizer", false, false) + "SI Load Store Optimizer", false, false) char SILoadStoreOptimizer::ID = 0; @@ -173,10 +174,18 @@ static void moveInstsAfter(MachineBasicBlock::iterator I, } } -static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) { - // XXX: Should this be looking for implicit defs? 
- for (const MachineOperand &Def : MI.defs()) - Defs.insert(Def.getReg()); +static void addDefsUsesToList(const MachineInstr &MI, + DenseSet<unsigned> &RegDefs, + DenseSet<unsigned> &PhysRegUses) { + for (const MachineOperand &Op : MI.operands()) { + if (Op.isReg()) { + if (Op.isDef()) + RegDefs.insert(Op.getReg()); + else if (Op.readsReg() && + TargetRegisterInfo::isPhysicalRegister(Op.getReg())) + PhysRegUses.insert(Op.getReg()); + } + } } static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, @@ -194,16 +203,24 @@ static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, // already in the list. Returns true in that case. static bool addToListsIfDependent(MachineInstr &MI, - DenseSet<unsigned> &Defs, + DenseSet<unsigned> &RegDefs, + DenseSet<unsigned> &PhysRegUses, SmallVectorImpl<MachineInstr*> &Insts) { for (MachineOperand &Use : MI.operands()) { // If one of the defs is read, then there is a use of Def between I and the // instruction that I will potentially be merged with. We will need to move // this instruction after the merged instructions. - - if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) { + // + // Similarly, if there is a def which is read by an instruction that is to + // be moved for merging, then we need to move the def-instruction as well. + // This can only happen for physical registers such as M0; virtual + // registers are in SSA form. + if (Use.isReg() && + ((Use.readsReg() && RegDefs.count(Use.getReg())) || + (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) && + PhysRegUses.count(Use.getReg())))) { Insts.push_back(&MI); - addDefsToList(MI, Defs); + addDefsUsesToList(MI, RegDefs, PhysRegUses); return true; } } @@ -332,8 +349,9 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { ++MBBI; - DenseSet<unsigned> DefsToMove; - addDefsToList(*CI.I, DefsToMove); + DenseSet<unsigned> RegDefsToMove; + DenseSet<unsigned> PhysRegUsesToMove; + addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); for ( ; MBBI != E; ++MBBI) { if (MBBI->getOpcode() != CI.I->getOpcode()) { @@ -356,14 +374,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { // #2. Add this instruction to the move list and then we will check // if condition #2 holds once we have selected the matching instruction. CI.InstsToMove.push_back(&*MBBI); - addDefsToList(*MBBI, DefsToMove); + addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove); continue; } // When we match I with another DS instruction we will be moving I down // to the location of the matched instruction any uses of I will need to // be moved down as well. - addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove); + addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, + CI.InstsToMove); continue; } @@ -377,7 +396,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { // DS_WRITE_B32 addr, f(w), idx1 // where the DS_READ_B32 ends up in InstsToMove and therefore prevents // merging of the two writes. - if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove)) + if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, + CI.InstsToMove)) continue; bool Match = true; @@ -436,7 +456,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { // down past this instruction. 
// check if we can move I across MBBI and if we can move all I's users if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || - !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) break; } return false; @@ -496,13 +516,15 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( unsigned BaseReg = AddrReg->getReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { + unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); + BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) + .addImm(CI.BaseOff); + BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - unsigned AddOpc = STM->hasAddNoCarry() ? - AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32; - BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg) - .addImm(CI.BaseOff) + TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) + .addReg(ImmReg) .addReg(AddrReg->getReg()); } @@ -532,7 +554,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); + LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); return Next; } @@ -556,7 +578,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( // Be sure to use .addOperand(), and not .addReg() with these. We want to be // sure we preserve the subregister index and any register flags set on them. - const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); + const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); const MachineOperand *Data1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); @@ -579,17 +601,19 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( const MCInstrDesc &Write2Desc = TII->get(Opc); DebugLoc DL = CI.I->getDebugLoc(); - unsigned BaseReg = Addr->getReg(); + unsigned BaseReg = AddrReg->getReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { + unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); + BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) + .addImm(CI.BaseOff); + BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - unsigned AddOpc = STM->hasAddNoCarry() ? - AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32; - BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg) - .addImm(CI.BaseOff) - .addReg(Addr->getReg()); + TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) + .addReg(ImmReg) + .addReg(AddrReg->getReg()); } MachineInstrBuilder Write2 = @@ -608,7 +632,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); + LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); return Next; } @@ -849,9 +873,8 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { continue; } - if (STM->hasSBufferLoadStoreAtomicDwordxN() && - (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM || - Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) { + if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM || + Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) { // EltSize is in units of the offset encoding. 
CI.InstClass = S_BUFFER_LOAD_IMM; CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4); @@ -916,7 +939,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; - STM = &MF.getSubtarget<SISubtarget>(); + STM = &MF.getSubtarget<GCNSubtarget>(); if (!STM->loadStoreOptEnabled()) return false; @@ -928,7 +951,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { assert(MRI->isSSA() && "Must be run on SSA"); - DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); + LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); bool Modified = false; diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index a9af83323976..ad30317c344c 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This pass lowers the pseudo control flow instructions to real +/// This pass lowers the pseudo control flow instructions to real /// machine instructions. /// /// All control flow is handled using predicated instructions and @@ -51,6 +51,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -343,11 +344,49 @@ void SILowerControlFlow::emitBreak(MachineInstr &MI) { } void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { - MI.setDesc(TII->get(AMDGPU::S_OR_B64)); + MachineBasicBlock &MBB = *MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + auto Dst = MI.getOperand(0).getReg(); + + // Skip ANDing with exec if the break condition is already masked by exec + // because it is a V_CMP in the same basic block. (We know the break + // condition operand was an i1 in IR, so if it is a VALU instruction it must + // be one with a carry-out.) + bool SkipAnding = false; + if (MI.getOperand(1).isReg()) { + if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) { + SkipAnding = Def->getParent() == MI.getParent() + && SIInstrInfo::isVALU(*Def); + } + } + + // AND the break condition operand with exec, then OR that into the "loop + // exit" mask. + MachineInstr *And = nullptr, *Or = nullptr; + if (!SkipAnding) { + And = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst) + .addReg(AMDGPU::EXEC) + .add(MI.getOperand(1)); + Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Dst) + .add(MI.getOperand(2)); + } else + Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)); + + if (LIS) { + if (And) + LIS->InsertMachineInstrInMaps(*And); + LIS->ReplaceMachineInstrInMaps(MI, *Or); + } + + MI.eraseFromParent(); } void SILowerControlFlow::emitElseBreak(MachineInstr &MI) { - MI.setDesc(TII->get(AMDGPU::S_OR_B64)); + // Lowered in the same way as emitIfBreak above. 
+ emitIfBreak(MI); } void SILowerControlFlow::emitLoop(MachineInstr &MI) { @@ -414,8 +453,8 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, return; for (const auto &SrcOp : Def->explicit_operands()) - if (SrcOp.isUse() && (!SrcOp.isReg() || - TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) || + if (SrcOp.isReg() && SrcOp.isUse() && + (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) || SrcOp.getReg() == AMDGPU::EXEC)) Src.push_back(SrcOp); } @@ -447,7 +486,7 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) { } bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp index da57b90dd8c4..ecc6cff407e1 100644 --- a/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -17,6 +17,8 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "Utils/AMDGPULaneDominator.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -64,7 +66,7 @@ FunctionPass *llvm::createSILowerI1CopiesPass() { bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const TargetRegisterInfo *TRI = &TII->getRegisterInfo(); @@ -141,7 +143,8 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { DefInst->getOperand(3).getReg()) && TRI->getCommonSubClass( MRI.getRegClass(DefInst->getOperand(3).getReg()), - &AMDGPU::SGPR_64RegClass)) { + &AMDGPU::SGPR_64RegClass) && + AMDGPU::laneDominates(DefInst->getParent(), &MBB)) { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64)) .add(Dst) .addReg(AMDGPU::EXEC) diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 6013ebc81d9f..0d5ff75e37ed 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -11,6 +11,7 @@ #include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUSubtarget.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Optional.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -28,17 +29,12 @@ using namespace llvm; SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), - BufferPSV(*(MF.getSubtarget().getInstrInfo())), - ImagePSV(*(MF.getSubtarget().getInstrInfo())), PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), KernargSegmentPtr(false), DispatchID(false), FlatScratchInit(false), - GridWorkgroupCountX(false), - GridWorkgroupCountY(false), - GridWorkgroupCountZ(false), WorkGroupIDX(false), WorkGroupIDY(false), WorkGroupIDZ(false), @@ -49,12 +45,26 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDZ(false), ImplicitBufferPtr(false), ImplicitArgPtr(false), - GITPtrHigh(0xffffffff) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + GITPtrHigh(0xffffffff), + HighBitsOf32BitAddress(0) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const Function &F = 
MF.getFunction(); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); + Occupancy = getMaxWavesPerEU(); + limitOccupancy(MF); + CallingConv::ID CC = F.getCallingConv(); + + if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { + if (!F.arg_empty()) + KernargSegmentPtr = true; + WorkGroupIDX = true; + WorkItemIDX = true; + } else if (CC == CallingConv::AMDGPU_PS) { + PSInputAddr = AMDGPU::getInitialPSInputAddr(F); + } + if (!isEntryFunction()) { // Non-entry functions have no special inputs for now, other registers // required for scratch access. @@ -71,18 +81,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) ImplicitArgPtr = true; } else { - if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) - KernargSegmentPtr = true; - } - - CallingConv::ID CC = F.getCallingConv(); - if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { - if (!F.arg_empty()) + if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) { KernargSegmentPtr = true; - WorkGroupIDX = true; - WorkItemIDX = true; - } else if (CC == CallingConv::AMDGPU_PS) { - PSInputAddr = AMDGPU::getInitialPSInputAddr(F); + MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(), + MaxKernArgAlign); + } } if (ST.debuggerEmitPrologue()) { @@ -134,7 +137,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) } } - bool IsCOV2 = ST.isAmdCodeObjectV2(MF); + bool IsCOV2 = ST.isAmdCodeObjectV2(F); if (IsCOV2) { if (HasStackObjects || MaySpill) PrivateSegmentBuffer = true; @@ -147,7 +150,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (F.hasFnAttribute("amdgpu-dispatch-id")) DispatchID = true; - } else if (ST.isMesaGfxShader(MF)) { + } else if (ST.isMesaGfxShader(F)) { if (HasStackObjects || MaySpill) ImplicitBufferPtr = true; } @@ -166,6 +169,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) StringRef S = A.getValueAsString(); if (!S.empty()) S.consumeInteger(0, GITPtrHigh); + + A = F.getFnAttribute("amdgpu-32bit-address-high-bits"); + S = A.getValueAsString(); + if (!S.empty()) + S.consumeInteger(0, HighBitsOf32BitAddress); +} + +void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) { + limitOccupancy(getMaxWavesPerEU()); + const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>(); + limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(), + MF.getFunction())); } unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( @@ -238,7 +253,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, if (!SpillLanes.empty()) return true; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineFrameInfo &FrameInfo = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -269,10 +284,9 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, } Optional<int> CSRSpillFI; - if (FrameInfo.hasCalls() && CSRegs && isCalleeSavedReg(CSRegs, LaneVGPR)) { - // TODO: Should this be a CreateSpillStackObject? This is technically a - // weird CSR spill. 
- CSRSpillFI = FrameInfo.CreateStackObject(4, 4, false); + if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs && + isCalleeSavedReg(CSRegs, LaneVGPR)) { + CSRSpillFI = FrameInfo.CreateSpillStackObject(4, 4); } SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI)); @@ -295,3 +309,29 @@ void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) for (auto &R : SGPRToVGPRSpills) MFI.RemoveStackObject(R.first); } + + +/// \returns VGPR used for \p Dim' work item ID. +unsigned SIMachineFunctionInfo::getWorkItemIDVGPR(unsigned Dim) const { + switch (Dim) { + case 0: + assert(hasWorkItemIDX()); + return AMDGPU::VGPR0; + case 1: + assert(hasWorkItemIDY()); + return AMDGPU::VGPR1; + case 2: + assert(hasWorkItemIDZ()); + return AMDGPU::VGPR2; + } + llvm_unreachable("unexpected dimension"); +} + +MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const { + assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); + return AMDGPU::SGPR0 + NumUserSGPRs; +} + +MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const { + return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; +} diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 5dde72910ee3..ef91d1e43075 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -16,7 +16,9 @@ #include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUMachineFunction.h" +#include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" @@ -38,8 +40,9 @@ class TargetRegisterClass; class AMDGPUImagePseudoSourceValue : public PseudoSourceValue { public: + // TODO: Is the img rsrc useful? explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) : - PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) { } + PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) {} bool isConstant(const MachineFrameInfo *) const override { // This should probably be true for most images, but we will start by being @@ -48,15 +51,11 @@ public: } bool isAliased(const MachineFrameInfo *) const override { - // FIXME: If we ever change image intrinsics to accept fat pointers, then - // this could be true for some cases. - return false; + return true; } bool mayAlias(const MachineFrameInfo *) const override { - // FIXME: If we ever change image intrinsics to accept fat pointers, then - // this could be true for some cases. - return false; + return true; } }; @@ -72,15 +71,11 @@ public: } bool isAliased(const MachineFrameInfo *) const override { - // FIXME: If we ever change image intrinsics to accept fat pointers, then - // this could be true for some cases. - return false; + return true; } bool mayAlias(const MachineFrameInfo *) const override { - // FIXME: If we ever change image intrinsics to accept fat pointers, then - // this could be true for some cases. - return false; + return true; } }; @@ -135,8 +130,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { // Stack object indices for work item IDs. 
std::array<int, 3> DebuggerWorkItemIDStackObjectIndices = {{0, 0, 0}}; - AMDGPUBufferPseudoSourceValue BufferPSV; - AMDGPUImagePseudoSourceValue ImagePSV; + DenseMap<const Value *, + std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs; + DenseMap<const Value *, + std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs; private: unsigned LDSWaveSpillSize = 0; @@ -146,6 +143,7 @@ private: bool HasSpilledSGPRs = false; bool HasSpilledVGPRs = false; bool HasNonSpillStackObjects = false; + bool IsStackRealigned = false; unsigned NumSpilledSGPRs = 0; unsigned NumSpilledVGPRs = 0; @@ -157,9 +155,6 @@ private: bool KernargSegmentPtr : 1; bool DispatchID : 1; bool FlatScratchInit : 1; - bool GridWorkgroupCountX : 1; - bool GridWorkgroupCountY : 1; - bool GridWorkgroupCountZ : 1; // Feature bits required for inputs passed in system SGPRs. bool WorkGroupIDX : 1; // Always initialized. @@ -186,25 +181,25 @@ private: // current hardware only allows a 16 bit value. unsigned GITPtrHigh; - MCPhysReg getNextUserSGPR() const { - assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); - return AMDGPU::SGPR0 + NumUserSGPRs; - } + unsigned HighBitsOf32BitAddress; - MCPhysReg getNextSystemSGPR() const { - return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; - } + // Current recorded maximum possible occupancy. + unsigned Occupancy; + + MCPhysReg getNextUserSGPR() const; + + MCPhysReg getNextSystemSGPR() const; public: struct SpilledReg { - unsigned VGPR = AMDGPU::NoRegister; + unsigned VGPR = 0; int Lane = -1; SpilledReg() = default; SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) {} bool hasLane() { return Lane != -1;} - bool hasReg() { return VGPR != AMDGPU::NoRegister;} + bool hasReg() { return VGPR != 0;} }; struct SGPRSpillVGPRCSR { @@ -244,8 +239,8 @@ public: bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI); - bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; } - unsigned getTIDReg() const { return TIDReg; } + bool hasCalculatedTID() const { return TIDReg != 0; }; + unsigned getTIDReg() const { return TIDReg; }; void setTIDReg(unsigned Reg) { TIDReg = Reg; } unsigned getBytesInStackArgArea() const { @@ -338,18 +333,6 @@ public: return FlatScratchInit; } - bool hasGridWorkgroupCountX() const { - return GridWorkgroupCountX; - } - - bool hasGridWorkgroupCountY() const { - return GridWorkgroupCountY; - } - - bool hasGridWorkgroupCountZ() const { - return GridWorkgroupCountZ; - } - bool hasWorkGroupIDX() const { return WorkGroupIDX; } @@ -411,6 +394,10 @@ public: return GITPtrHigh; } + unsigned get32BitAddressHighBits() const { + return HighBitsOf32BitAddress; + } + unsigned getNumUserSGPRs() const { return NumUserSGPRs; } @@ -423,14 +410,14 @@ public: return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } - /// \brief Returns the physical register reserved for use as the resource + /// Returns the physical register reserved for use as the resource /// descriptor for scratch accesses. 
unsigned getScratchRSrcReg() const { return ScratchRSrcReg; } void setScratchRSrcReg(unsigned Reg) { - assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + assert(Reg != 0 && "Should never be unset"); ScratchRSrcReg = Reg; } @@ -443,6 +430,7 @@ public: } void setStackPtrOffsetReg(unsigned Reg) { + assert(Reg != 0 && "Should never be unset"); StackPtrOffsetReg = Reg; } @@ -455,7 +443,7 @@ public: } void setScratchWaveOffsetReg(unsigned Reg) { - assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + assert(Reg != 0 && "Should never be unset"); ScratchWaveOffsetReg = Reg; if (isEntryFunction()) FrameOffsetReg = ScratchWaveOffsetReg; @@ -493,6 +481,14 @@ public: HasNonSpillStackObjects = StackObject; } + bool isStackRealigned() const { + return IsStackRealigned; + } + + void setIsStackRealigned(bool Realigned = true) { + IsStackRealigned = Realigned; + } + unsigned getNumSpilledSGPRs() const { return NumSpilledSGPRs; } @@ -575,7 +571,7 @@ public: return DebuggerWorkGroupIDStackObjectIndices[Dim]; } - /// \brief Sets stack object index for \p Dim's work group ID to \p ObjectIdx. + /// Sets stack object index for \p Dim's work group ID to \p ObjectIdx. void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) { assert(Dim < 3); DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx; @@ -587,7 +583,7 @@ public: return DebuggerWorkItemIDStackObjectIndices[Dim]; } - /// \brief Sets stack object index for \p Dim's work item ID to \p ObjectIdx. + /// Sets stack object index for \p Dim's work item ID to \p ObjectIdx. void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) { assert(Dim < 3); DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx; @@ -610,31 +606,51 @@ public: } /// \returns VGPR used for \p Dim' work item ID. - unsigned getWorkItemIDVGPR(unsigned Dim) const { - switch (Dim) { - case 0: - assert(hasWorkItemIDX()); - return AMDGPU::VGPR0; - case 1: - assert(hasWorkItemIDY()); - return AMDGPU::VGPR1; - case 2: - assert(hasWorkItemIDZ()); - return AMDGPU::VGPR2; - } - llvm_unreachable("unexpected dimension"); - } + unsigned getWorkItemIDVGPR(unsigned Dim) const; unsigned getLDSWaveSpillSize() const { return LDSWaveSpillSize; } - const AMDGPUBufferPseudoSourceValue *getBufferPSV() const { - return &BufferPSV; + const AMDGPUBufferPseudoSourceValue *getBufferPSV(const SIInstrInfo &TII, + const Value *BufferRsrc) { + assert(BufferRsrc); + auto PSV = BufferPSVs.try_emplace( + BufferRsrc, + llvm::make_unique<AMDGPUBufferPseudoSourceValue>(TII)); + return PSV.first->second.get(); + } + + const AMDGPUImagePseudoSourceValue *getImagePSV(const SIInstrInfo &TII, + const Value *ImgRsrc) { + assert(ImgRsrc); + auto PSV = ImagePSVs.try_emplace( + ImgRsrc, + llvm::make_unique<AMDGPUImagePseudoSourceValue>(TII)); + return PSV.first->second.get(); + } + + unsigned getOccupancy() const { + return Occupancy; + } + + unsigned getMinAllowedOccupancy() const { + if (!isMemoryBound() && !needsWaveLimiter()) + return Occupancy; + return (Occupancy < 4) ? 
Occupancy : 4; + } + + void limitOccupancy(const MachineFunction &MF); + + void limitOccupancy(unsigned Limit) { + if (Occupancy > Limit) + Occupancy = Limit; } - const AMDGPUImagePseudoSourceValue *getImagePSV() const { - return &ImagePSV; + void increaseOccupancy(const MachineFunction &MF, unsigned Limit) { + if (Occupancy < Limit) + Occupancy = Limit; + limitOccupancy(MF); } }; diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp index 6b67b76652ed..18754442898f 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief SI Machine Scheduler interface +/// SI Machine Scheduler interface // //===----------------------------------------------------------------------===// @@ -16,6 +16,7 @@ #include "AMDGPU.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/LiveInterval.h" @@ -154,6 +155,8 @@ static const char *getReasonStr(SIScheduleCandReason Reason) { #endif +namespace llvm { +namespace SISched { static bool tryLess(int TryVal, int CandVal, SISchedulerCandidate &TryCand, SISchedulerCandidate &Cand, @@ -187,6 +190,8 @@ static bool tryGreater(int TryVal, int CandVal, Cand.setRepeat(Reason); return false; } +} // end namespace SISched +} // end namespace llvm // SIScheduleBlock // @@ -212,7 +217,8 @@ void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand, } if (Cand.SGPRUsage > 60 && - tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage)) + SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, + TryCand, Cand, RegUsage)) return; // Schedule low latency instructions as top as possible. @@ -230,21 +236,22 @@ void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand, // could go quite high, thus above the arbitrary limit of 60 will encourage // use the already loaded constants (in order to release some SGPRs) before // loading more. - if (tryLess(TryCand.HasLowLatencyNonWaitedParent, - Cand.HasLowLatencyNonWaitedParent, - TryCand, Cand, SIScheduleCandReason::Depth)) + if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent, + Cand.HasLowLatencyNonWaitedParent, + TryCand, Cand, SIScheduleCandReason::Depth)) return; - if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency, - TryCand, Cand, SIScheduleCandReason::Depth)) + if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency, + TryCand, Cand, SIScheduleCandReason::Depth)) return; if (TryCand.IsLowLatency && - tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset, - TryCand, Cand, SIScheduleCandReason::Depth)) + SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset, + TryCand, Cand, SIScheduleCandReason::Depth)) return; - if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage)) + if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, + TryCand, Cand, RegUsage)) return; // Fall through to original instruction order. 
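The tryLess/tryGreater calls now qualified with SISched:: are the scheduler's cascading tie-break helpers: each comparison either picks a winner (recording the deciding reason) or returns false so the caller falls through to the next, weaker criterion, which is why tryCandidateTopDown and the block-scheduling heuristics are written as chains of early returns. Below is a condensed, self-contained sketch of that pattern; the SISchedulerCandidate bookkeeping (setRepeat and friends) is deliberately elided:

    // Minimal sketch of the cascading tie-break idiom used by the SI machine
    // scheduler. Callers chain these helpers from the strongest to the weakest
    // criterion and stop at the first call that returns true.
    enum Reason { NoCand, RegUsage, Latency, Successor, Depth };

    struct Candidate {
      Reason Why = NoCand;   // which criterion decided this candidate
    };

    // Prefer the candidate with the smaller value; a tie falls through.
    static bool tryLess(int TryVal, int CandVal, Candidate &TryCand,
                        Candidate &Cand, Reason R) {
      if (TryVal < CandVal) {
        TryCand.Why = R;     // TryCand wins on this criterion
        return true;
      }
      if (TryVal > CandVal) {
        if (Cand.Why > R)    // record the strongest (lowest-numbered) reason
          Cand.Why = R;
        return true;         // Cand wins; stop comparing
      }
      return false;          // tie: let the caller try the next criterion
    }

    // Prefer the candidate with the larger value; same contract as tryLess.
    static bool tryGreater(int TryVal, int CandVal, Candidate &TryCand,
                           Candidate &Cand, Reason R) {
      return tryLess(CandVal, TryVal, TryCand, Cand, R);
    }
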
@@ -1201,7 +1208,7 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria NextReservedID = 1; NextNonReservedID = DAGSize + 1; - DEBUG(dbgs() << "Coloring the graph\n"); + LLVM_DEBUG(dbgs() << "Coloring the graph\n"); if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped) colorHighLatenciesGroups(); @@ -1258,13 +1265,11 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria SIScheduleBlock *Block = CurrentBlocks[i]; Block->finalizeUnits(); } - DEBUG( - dbgs() << "Blocks created:\n\n"; - for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->printDebug(true); - } - ); + LLVM_DEBUG(dbgs() << "Blocks created:\n\n"; + for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->printDebug(true); + }); } // Two functions taken from Codegen/MachineScheduler.cpp @@ -1274,7 +1279,7 @@ static MachineBasicBlock::iterator nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::const_iterator End) { for (; I != End; ++I) { - if (!I->isDebugValue()) + if (!I->isDebugInstr()) break; } return I; @@ -1284,7 +1289,7 @@ void SIScheduleBlockCreator::topologicalSort() { unsigned DAGSize = CurrentBlocks.size(); std::vector<int> WorkList; - DEBUG(dbgs() << "Topological Sort\n"); + LLVM_DEBUG(dbgs() << "Topological Sort\n"); WorkList.reserve(DAGSize); TopDownIndex2Block.resize(DAGSize); @@ -1331,11 +1336,11 @@ void SIScheduleBlockCreator::topologicalSort() { void SIScheduleBlockCreator::scheduleInsideBlocks() { unsigned DAGSize = CurrentBlocks.size(); - DEBUG(dbgs() << "\nScheduling Blocks\n\n"); + LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n"); // We do schedule a valid scheduling such that a Block corresponds // to a range of instructions. - DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n"); + LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n"); for (unsigned i = 0, e = DAGSize; i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks[i]; Block->fastSchedule(); @@ -1389,7 +1394,7 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() { Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr()); } - DEBUG(dbgs() << "Restoring MI Pos\n"); + LLVM_DEBUG(dbgs() << "Restoring MI Pos\n"); // Restore old ordering (which prevents a LIS->handleMove bug). 
for (unsigned i = PosOld.size(), e = 0; i != e; --i) { MachineBasicBlock::iterator POld = PosOld[i-1]; @@ -1403,12 +1408,10 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() { } } - DEBUG( - for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->printDebug(true); - } - ); + LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { + SIScheduleBlock *Block = CurrentBlocks[i]; + Block->printDebug(true); + }); } void SIScheduleBlockCreator::fillStats() { @@ -1559,13 +1562,10 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, blockScheduled(Block); } - DEBUG( - dbgs() << "Block Order:"; - for (SIScheduleBlock* Block : BlocksScheduled) { - dbgs() << ' ' << Block->getID(); - } - dbgs() << '\n'; - ); + LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block + : BlocksScheduled) { + dbgs() << ' ' << Block->getID(); + } dbgs() << '\n';); } bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand, @@ -1576,19 +1576,19 @@ bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand, } // Try to hide high latencies. - if (tryLess(TryCand.LastPosHighLatParentScheduled, - Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency)) + if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled, + Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency)) return true; // Schedule high latencies early so you can hide them better. - if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency, - TryCand, Cand, Latency)) + if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency, + TryCand, Cand, Latency)) return true; - if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height, - TryCand, Cand, Depth)) + if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height, + TryCand, Cand, Depth)) return true; - if (tryGreater(TryCand.NumHighLatencySuccessors, - Cand.NumHighLatencySuccessors, - TryCand, Cand, Successor)) + if (SISched::tryGreater(TryCand.NumHighLatencySuccessors, + Cand.NumHighLatencySuccessors, + TryCand, Cand, Successor)) return true; return false; } @@ -1600,17 +1600,17 @@ bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand, return true; } - if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0, - TryCand, Cand, RegUsage)) + if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0, + TryCand, Cand, RegUsage)) return true; - if (tryGreater(TryCand.NumSuccessors > 0, - Cand.NumSuccessors > 0, - TryCand, Cand, Successor)) + if (SISched::tryGreater(TryCand.NumSuccessors > 0, + Cand.NumSuccessors > 0, + TryCand, Cand, Successor)) return true; - if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth)) + if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth)) return true; - if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff, - TryCand, Cand, RegUsage)) + if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff, + TryCand, Cand, RegUsage)) return true; return false; } @@ -1628,18 +1628,17 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { maxVregUsage = VregCurrentUsage; if (SregCurrentUsage > maxSregUsage) maxSregUsage = SregCurrentUsage; - DEBUG( - dbgs() << "Picking New Blocks\n"; - dbgs() << "Available: "; - for (SIScheduleBlock* Block : ReadyBlocks) - dbgs() << Block->getID() << ' '; - dbgs() << "\nCurrent Live:\n"; - for (unsigned Reg : LiveRegs) - dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; - dbgs() << 
'\n'; - dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; - dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n'; - ); + LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: "; + for (SIScheduleBlock *Block + : ReadyBlocks) dbgs() + << Block->getID() << ' '; + dbgs() << "\nCurrent Live:\n"; + for (unsigned Reg + : LiveRegs) dbgs() + << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; + dbgs() << '\n'; + dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; + dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';); Cand.Block = nullptr; for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(), @@ -1671,20 +1670,18 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { if (TryCand.Reason != NoCand) { Cand.setBest(TryCand); Best = I; - DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' ' - << getReasonStr(Cand.Reason) << '\n'); + LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' ' + << getReasonStr(Cand.Reason) << '\n'); } } - DEBUG( - dbgs() << "Picking: " << Cand.Block->getID() << '\n'; - dbgs() << "Is a block with high latency instruction: " - << (Cand.IsHighLatency ? "yes\n" : "no\n"); - dbgs() << "Position of last high latency dependency: " - << Cand.LastPosHighLatParentScheduled << '\n'; - dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n'; - dbgs() << '\n'; - ); + LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n'; + dbgs() << "Is a block with high latency instruction: " + << (Cand.IsHighLatency ? "yes\n" : "no\n"); + dbgs() << "Position of last high latency dependency: " + << Cand.LastPosHighLatParentScheduled << '\n'; + dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n'; + dbgs() << '\n';); Block = Cand.Block; ReadyBlocks.erase(Best); @@ -1933,13 +1930,10 @@ void SIScheduleDAGMI::schedule() { SmallVector<SUnit*, 8> TopRoots, BotRoots; SIScheduleBlockResult Best, Temp; - DEBUG(dbgs() << "Preparing Scheduling\n"); + LLVM_DEBUG(dbgs() << "Preparing Scheduling\n"); buildDAGWithRegPressure(); - DEBUG( - for(SUnit& SU : SUnits) - SU.dumpAll(this) - ); + LLVM_DEBUG(for (SUnit &SU : SUnits) SU.dumpAll(this)); topologicalSort(); findRootsAndBiasEdges(TopRoots, BotRoots); @@ -2041,15 +2035,15 @@ void SIScheduleDAGMI::schedule() scheduleMI(SU, true); - DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " - << *SU->getInstr()); + LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " + << *SU->getInstr()); } assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); placeDebugValues(); - DEBUG({ + LLVM_DEBUG({ dbgs() << "*** Final schedule for " << printMBBReference(*begin()->getParent()) << " ***\n"; dumpSchedule(); diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h index d824e38504e6..0ce68ac6a897 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/lib/Target/AMDGPU/SIMachineScheduler.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief SI Machine Scheduler interface +/// SI Machine Scheduler interface // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index c73fb10b7ea0..938cdaf1ef8f 100644 --- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Memory legalizer - 
implements memory model. More information can be +/// Memory legalizer - implements memory model. More information can be /// found here: /// http://llvm.org/docs/AMDGPUUsage.html#memory-model // @@ -19,7 +19,9 @@ #include "AMDGPUSubtarget.h" #include "SIDefines.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -36,6 +38,7 @@ #include "llvm/MC/MCInstrDesc.h" #include "llvm/Pass.h" #include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/MathExtras.h" #include <cassert> #include <list> @@ -47,42 +50,142 @@ using namespace llvm::AMDGPU; namespace { +LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); + +/// Memory operation flags. Can be ORed together. +enum class SIMemOp { + NONE = 0u, + LOAD = 1u << 0, + STORE = 1u << 1, + LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE) +}; + +/// Position to insert a new instruction relative to an existing +/// instruction. +enum class Position { + BEFORE, + AFTER +}; + +/// The atomic synchronization scopes supported by the AMDGPU target. +enum class SIAtomicScope { + NONE, + SINGLETHREAD, + WAVEFRONT, + WORKGROUP, + AGENT, + SYSTEM +}; + +/// The distinct address spaces supported by the AMDGPU target for +/// atomic memory operation. Can be ORed toether. +enum class SIAtomicAddrSpace { + NONE = 0u, + GLOBAL = 1u << 0, + LDS = 1u << 1, + SCRATCH = 1u << 2, + GDS = 1u << 3, + OTHER = 1u << 4, + + /// The address spaces that can be accessed by a FLAT instruction. + FLAT = GLOBAL | LDS | SCRATCH, + + /// The address spaces that support atomic instructions. + ATOMIC = GLOBAL | LDS | SCRATCH | GDS, + + /// All address spaces. + ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER, + + LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) +}; + +/// Sets named bit \p BitName to "true" if present in instruction \p MI. +/// \returns Returns true if \p MI is modified, false otherwise. +template <uint16_t BitName> +bool enableNamedBit(const MachineBasicBlock::iterator &MI) { + int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName); + if (BitIdx == -1) + return false; + + MachineOperand &Bit = MI->getOperand(BitIdx); + if (Bit.getImm() != 0) + return false; + + Bit.setImm(1); + return true; +} + class SIMemOpInfo final { private: - SyncScope::ID SSID = SyncScope::System; + + friend class SIMemOpAccess; + AtomicOrdering Ordering = AtomicOrdering::NotAtomic; AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; + SIAtomicScope Scope = SIAtomicScope::SYSTEM; + SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; + SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; + bool IsCrossAddressSpaceOrdering = false; bool IsNonTemporal = false; - SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering) - : SSID(SSID), Ordering(Ordering) {} - - SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering, - AtomicOrdering FailureOrdering, bool IsNonTemporal = false) - : SSID(SSID), Ordering(Ordering), FailureOrdering(FailureOrdering), - IsNonTemporal(IsNonTemporal) {} - - /// \returns Info constructed from \p MI, which has at least machine memory - /// operand. 
- static Optional<SIMemOpInfo> constructFromMIWithMMO( - const MachineBasicBlock::iterator &MI); + SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, + SIAtomicScope Scope = SIAtomicScope::SYSTEM, + SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC, + SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL, + bool IsCrossAddressSpaceOrdering = true, + AtomicOrdering FailureOrdering = + AtomicOrdering::SequentiallyConsistent, + bool IsNonTemporal = false) + : Ordering(Ordering), FailureOrdering(FailureOrdering), + Scope(Scope), OrderingAddrSpace(OrderingAddrSpace), + InstrAddrSpace(InstrAddrSpace), + IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering), + IsNonTemporal(IsNonTemporal) { + // There is also no cross address space ordering if the ordering + // address space is the same as the instruction address space and + // only contains a single address space. + if ((OrderingAddrSpace == InstrAddrSpace) && + isPowerOf2_32(uint32_t(InstrAddrSpace))) + IsCrossAddressSpaceOrdering = false; + } public: - /// \returns Synchronization scope ID of the machine instruction used to + /// \returns Atomic synchronization scope of the machine instruction used to /// create this SIMemOpInfo. - SyncScope::ID getSSID() const { - return SSID; + SIAtomicScope getScope() const { + return Scope; } + /// \returns Ordering constraint of the machine instruction used to /// create this SIMemOpInfo. AtomicOrdering getOrdering() const { return Ordering; } + /// \returns Failure ordering constraint of the machine instruction used to /// create this SIMemOpInfo. AtomicOrdering getFailureOrdering() const { return FailureOrdering; } + + /// \returns The address spaces accessed by the machine + /// instruction used to create this SIMemOpInfo. + SIAtomicAddrSpace getInstrAddrSpace() const { + return InstrAddrSpace; + } + + /// \returns The address spaces that must be ordered by the machine + /// instruction used to create this SIMemOpInfo. + SIAtomicAddrSpace getOrderingAddrSpace() const { + return OrderingAddrSpace; + } + + /// \returns True iff memory ordering of operations on + /// different address spaces is required. + bool getIsCrossAddressSpaceOrdering() const { + return IsCrossAddressSpaceOrdering; + } + /// \returns True if memory access of the machine instruction used to /// create this SIMemOpInfo is non-temporal, false otherwise. bool isNonTemporal() const { @@ -95,109 +198,198 @@ public: return Ordering != AtomicOrdering::NotAtomic; } +}; + +class SIMemOpAccess final { +private: + + AMDGPUAS SIAddrSpaceInfo; + AMDGPUMachineModuleInfo *MMI = nullptr; + + /// Reports unsupported message \p Msg for \p MI to LLVM context. + void reportUnsupported(const MachineBasicBlock::iterator &MI, + const char *Msg) const; + + /// Inspects the target synchronization scope \p SSID and determines + /// the SI atomic scope it corresponds to, the address spaces it + /// covers, and whether the memory ordering applies between address + /// spaces. + Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> + toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const; + + /// \returns A bit set of the address spaces corresponding to \p AS. + SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const; + + /// \returns Info constructed from \p MI, which has at least machine memory + /// operand.
+ Optional<SIMemOpInfo> constructFromMIWithMMO( + const MachineBasicBlock::iterator &MI) const; + +public: + /// Construct class to support accessing the machine memory operands + /// of instructions in the machine function \p MF. + SIMemOpAccess(MachineFunction &MF); + /// \returns Load info if \p MI is a load operation, "None" otherwise. - static Optional<SIMemOpInfo> getLoadInfo( - const MachineBasicBlock::iterator &MI); + Optional<SIMemOpInfo> getLoadInfo( + const MachineBasicBlock::iterator &MI) const; + /// \returns Store info if \p MI is a store operation, "None" otherwise. - static Optional<SIMemOpInfo> getStoreInfo( - const MachineBasicBlock::iterator &MI); + Optional<SIMemOpInfo> getStoreInfo( + const MachineBasicBlock::iterator &MI) const; + /// \returns Atomic fence info if \p MI is an atomic fence operation, /// "None" otherwise. - static Optional<SIMemOpInfo> getAtomicFenceInfo( - const MachineBasicBlock::iterator &MI); - /// \returns Atomic cmpxchg info if \p MI is an atomic cmpxchg operation, - /// "None" otherwise. - static Optional<SIMemOpInfo> getAtomicCmpxchgInfo( - const MachineBasicBlock::iterator &MI); - /// \returns Atomic rmw info if \p MI is an atomic rmw operation, - /// "None" otherwise. - static Optional<SIMemOpInfo> getAtomicRmwInfo( - const MachineBasicBlock::iterator &MI); + Optional<SIMemOpInfo> getAtomicFenceInfo( + const MachineBasicBlock::iterator &MI) const; - /// \brief Reports unknown synchronization scope used in \p MI to LLVM - /// context. - static void reportUnknownSyncScope( - const MachineBasicBlock::iterator &MI); + /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or + /// rmw operation, "None" otherwise. + Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo( + const MachineBasicBlock::iterator &MI) const; }; -class SIMemoryLegalizer final : public MachineFunctionPass { -private: - /// \brief Machine module info. - const AMDGPUMachineModuleInfo *MMI = nullptr; +class SICacheControl { +protected: - /// \brief Instruction info. + /// Instruction info. const SIInstrInfo *TII = nullptr; - /// \brief Immediate for "vmcnt(0)". - unsigned Vmcnt0Immediate = 0; + IsaInfo::IsaVersion IV; - /// \brief Opcode for cache invalidation instruction (L1). - unsigned Wbinvl1Opcode = 0; + SICacheControl(const GCNSubtarget &ST); - /// \brief List of atomic pseudo instructions. - std::list<MachineBasicBlock::iterator> AtomicPseudoMIs; +public: - /// \brief Sets named bit (BitName) to "true" if present in \p MI. Returns - /// true if \p MI is modified, false otherwise. - template <uint16_t BitName> - bool enableNamedBit(const MachineBasicBlock::iterator &MI) const { - int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName); - if (BitIdx == -1) - return false; + /// Create a cache control for the subtarget \p ST. + static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST); + + /// Update \p MI memory load instruction to bypass any caches up to + /// the \p Scope memory scope for address spaces \p + /// AddrSpace. Return true iff the instruction was modified. + virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const = 0; + + /// Update \p MI memory instruction to indicate it is + /// nontemporal. Return true iff the instruction was modified. 
+ virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI) + const = 0; + + /// Inserts any necessary instructions at position \p Pos relative + /// to instruction \p MI to ensure any caches associated with + /// address spaces \p AddrSpace for memory scopes up to memory scope + /// \p Scope are invalidated. Returns true iff any instructions + /// inserted. + virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const = 0; + + /// Inserts any necessary instructions at position \p Pos relative + /// to instruction \p MI to ensure memory instructions of kind \p Op + /// associated with address spaces \p AddrSpace have completed as + /// observed by other memory instructions executing in memory scope + /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory + /// ordering is between address spaces. Returns true iff any + /// instructions inserted. + virtual bool insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const = 0; + + /// Virtual destructor to allow derivations to be deleted. + virtual ~SICacheControl() = default; - MachineOperand &Bit = MI->getOperand(BitIdx); - if (Bit.getImm() != 0) - return false; +}; - Bit.setImm(1); - return true; - } +class SIGfx6CacheControl : public SICacheControl { +protected: - /// \brief Sets GLC bit to "true" if present in \p MI. Returns true if \p MI + /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI /// is modified, false otherwise. bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { return enableNamedBit<AMDGPU::OpName::glc>(MI); } - /// \brief Sets SLC bit to "true" if present in \p MI. Returns true if \p MI + /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI /// is modified, false otherwise. bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { return enableNamedBit<AMDGPU::OpName::slc>(MI); } - /// \brief Inserts "buffer_wbinvl1_vol" instruction \p Before or after \p MI. - /// Always returns true. - bool insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI, - bool Before = true) const; - /// \brief Inserts "s_waitcnt vmcnt(0)" instruction \p Before or after \p MI. - /// Always returns true. 
- bool insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI, - bool Before = true) const; +public: + + SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}; + + bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const override; + + bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override; + + bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const override; + + bool insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const override; +}; + +class SIGfx7CacheControl : public SIGfx6CacheControl { +public: - /// \brief Removes all processed atomic pseudo instructions from the current + SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}; + + bool insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const override; + +}; + +class SIMemoryLegalizer final : public MachineFunctionPass { +private: + + /// Cache Control. + std::unique_ptr<SICacheControl> CC = nullptr; + + /// List of atomic pseudo instructions. + std::list<MachineBasicBlock::iterator> AtomicPseudoMIs; + + /// Return true iff instruction \p MI is a atomic instruction that + /// returns a result. + bool isAtomicRet(const MachineInstr &MI) const { + return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1; + } + + /// Removes all processed atomic pseudo instructions from the current /// function. Returns true if current function is modified, false otherwise. bool removeAtomicPseudoMIs(); - /// \brief Expands load operation \p MI. Returns true if instructions are + /// Expands load operation \p MI. Returns true if instructions are /// added/deleted or \p MI is modified, false otherwise. bool expandLoad(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); - /// \brief Expands store operation \p MI. Returns true if instructions are + /// Expands store operation \p MI. Returns true if instructions are /// added/deleted or \p MI is modified, false otherwise. bool expandStore(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); - /// \brief Expands atomic fence operation \p MI. Returns true if + /// Expands atomic fence operation \p MI. Returns true if /// instructions are added/deleted or \p MI is modified, false otherwise. bool expandAtomicFence(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); - /// \brief Expands atomic cmpxchg operation \p MI. Returns true if - /// instructions are added/deleted or \p MI is modified, false otherwise. - bool expandAtomicCmpxchg(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI); - /// \brief Expands atomic rmw operation \p MI. Returns true if + /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if /// instructions are added/deleted or \p MI is modified, false otherwise. 
- bool expandAtomicRmw(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI); + bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI); public: static char ID; @@ -218,48 +410,129 @@ public: } // end namespace anonymous -/* static */ -Optional<SIMemOpInfo> SIMemOpInfo::constructFromMIWithMMO( - const MachineBasicBlock::iterator &MI) { - assert(MI->getNumMemOperands() > 0); +void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI, + const char *Msg) const { + const Function &Func = MI->getParent()->getParent()->getFunction(); + DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc()); + Func.getContext().diagnose(Diag); +} - const MachineFunction *MF = MI->getParent()->getParent(); - const AMDGPUMachineModuleInfo *MMI = - &MF->getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); +Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>> +SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, + SIAtomicAddrSpace InstrScope) const { + /// TODO: For now assume OpenCL memory model which treats each + /// address space as having a separate happens-before relation, and + /// so an instruction only has ordering with respect to the address + /// space it accesses, and if it accesses multiple address spaces it + /// does not require ordering of operations in different address + /// spaces. + if (SSID == SyncScope::System) + return std::make_tuple(SIAtomicScope::SYSTEM, + SIAtomicAddrSpace::ATOMIC & InstrScope, + false); + if (SSID == MMI->getAgentSSID()) + return std::make_tuple(SIAtomicScope::AGENT, + SIAtomicAddrSpace::ATOMIC & InstrScope, + false); + if (SSID == MMI->getWorkgroupSSID()) + return std::make_tuple(SIAtomicScope::WORKGROUP, + SIAtomicAddrSpace::ATOMIC & InstrScope, + false); + if (SSID == MMI->getWavefrontSSID()) + return std::make_tuple(SIAtomicScope::WAVEFRONT, + SIAtomicAddrSpace::ATOMIC & InstrScope, + false); + if (SSID == SyncScope::SingleThread) + return std::make_tuple(SIAtomicScope::SINGLETHREAD, + SIAtomicAddrSpace::ATOMIC & InstrScope, + false); + /// TODO: To support HSA Memory Model need to add additional memory + /// scopes that specify that do require cross address space + /// ordering. + return None; +} + +SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { + if (AS == SIAddrSpaceInfo.FLAT_ADDRESS) + return SIAtomicAddrSpace::FLAT; + if (AS == SIAddrSpaceInfo.GLOBAL_ADDRESS) + return SIAtomicAddrSpace::GLOBAL; + if (AS == SIAddrSpaceInfo.LOCAL_ADDRESS) + return SIAtomicAddrSpace::LDS; + if (AS == SIAddrSpaceInfo.PRIVATE_ADDRESS) + return SIAtomicAddrSpace::SCRATCH; + if (AS == SIAddrSpaceInfo.REGION_ADDRESS) + return SIAtomicAddrSpace::GDS; + + return SIAtomicAddrSpace::OTHER; +} + +SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) { + SIAddrSpaceInfo = getAMDGPUAS(MF.getTarget()); + MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); +} + +Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO( + const MachineBasicBlock::iterator &MI) const { + assert(MI->getNumMemOperands() > 0); SyncScope::ID SSID = SyncScope::SingleThread; AtomicOrdering Ordering = AtomicOrdering::NotAtomic; AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; + SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE; bool IsNonTemporal = true; // Validator should check whether or not MMOs cover the entire set of // locations accessed by the memory instruction. 
for (const auto &MMO : MI->memoperands()) { - const auto &IsSyncScopeInclusion = - MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); - if (!IsSyncScopeInclusion) { - reportUnknownSyncScope(MI); - return None; - } - - SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID(); - Ordering = - isStrongerThan(Ordering, MMO->getOrdering()) ? - Ordering : MMO->getOrdering(); - FailureOrdering = - isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ? - FailureOrdering : MMO->getFailureOrdering(); + IsNonTemporal &= MMO->isNonTemporal(); + InstrAddrSpace |= + toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace()); + AtomicOrdering OpOrdering = MMO->getOrdering(); + if (OpOrdering != AtomicOrdering::NotAtomic) { + const auto &IsSyncScopeInclusion = + MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); + if (!IsSyncScopeInclusion) { + reportUnsupported(MI, + "Unsupported non-inclusive atomic synchronization scope"); + return None; + } - if (!(MMO->getFlags() & MachineMemOperand::MONonTemporal)) - IsNonTemporal = false; + SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID(); + Ordering = + isStrongerThan(Ordering, OpOrdering) ? + Ordering : MMO->getOrdering(); + assert(MMO->getFailureOrdering() != AtomicOrdering::Release && + MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease); + FailureOrdering = + isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ? + FailureOrdering : MMO->getFailureOrdering(); + } } - return SIMemOpInfo(SSID, Ordering, FailureOrdering, IsNonTemporal); + SIAtomicScope Scope = SIAtomicScope::NONE; + SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; + bool IsCrossAddressSpaceOrdering = false; + if (Ordering != AtomicOrdering::NotAtomic) { + auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace); + if (!ScopeOrNone) { + reportUnsupported(MI, "Unsupported atomic synchronization scope"); + return None; + } + std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = + ScopeOrNone.getValue(); + if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || + ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { + reportUnsupported(MI, "Unsupported atomic address space"); + return None; + } + } + return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace, + IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal); } -/* static */ -Optional<SIMemOpInfo> SIMemOpInfo::getLoadInfo( - const MachineBasicBlock::iterator &MI) { +Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo( + const MachineBasicBlock::iterator &MI) const { assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); if (!(MI->mayLoad() && !MI->mayStore())) @@ -267,15 +540,13 @@ Optional<SIMemOpInfo> SIMemOpInfo::getLoadInfo( // Be conservative if there are no memory operands. if (MI->getNumMemOperands() == 0) - return SIMemOpInfo(SyncScope::System, - AtomicOrdering::SequentiallyConsistent); + return SIMemOpInfo(); - return SIMemOpInfo::constructFromMIWithMMO(MI); + return constructFromMIWithMMO(MI); } -/* static */ -Optional<SIMemOpInfo> SIMemOpInfo::getStoreInfo( - const MachineBasicBlock::iterator &MI) { +Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo( + const MachineBasicBlock::iterator &MI) const { assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); if (!(!MI->mayLoad() && MI->mayStore())) @@ -283,30 +554,46 @@ Optional<SIMemOpInfo> SIMemOpInfo::getStoreInfo( // Be conservative if there are no memory operands. 
if (MI->getNumMemOperands() == 0) - return SIMemOpInfo(SyncScope::System, - AtomicOrdering::SequentiallyConsistent); + return SIMemOpInfo(); - return SIMemOpInfo::constructFromMIWithMMO(MI); + return constructFromMIWithMMO(MI); } -/* static */ -Optional<SIMemOpInfo> SIMemOpInfo::getAtomicFenceInfo( - const MachineBasicBlock::iterator &MI) { +Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo( + const MachineBasicBlock::iterator &MI) const { assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) return None; - SyncScope::ID SSID = - static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); AtomicOrdering Ordering = - static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); - return SIMemOpInfo(SSID, Ordering); + static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); + + SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); + auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC); + if (!ScopeOrNone) { + reportUnsupported(MI, "Unsupported atomic synchronization scope"); + return None; + } + + SIAtomicScope Scope = SIAtomicScope::NONE; + SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE; + bool IsCrossAddressSpaceOrdering = false; + std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) = + ScopeOrNone.getValue(); + + if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) || + ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) { + reportUnsupported(MI, "Unsupported atomic address space"); + return None; + } + + return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC, + IsCrossAddressSpaceOrdering); } -/* static */ -Optional<SIMemOpInfo> SIMemOpInfo::getAtomicCmpxchgInfo( - const MachineBasicBlock::iterator &MI) { +Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( + const MachineBasicBlock::iterator &MI) const { assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); if (!(MI->mayLoad() && MI->mayStore())) @@ -314,68 +601,251 @@ Optional<SIMemOpInfo> SIMemOpInfo::getAtomicCmpxchgInfo( // Be conservative if there are no memory operands. if (MI->getNumMemOperands() == 0) - return SIMemOpInfo(SyncScope::System, - AtomicOrdering::SequentiallyConsistent, - AtomicOrdering::SequentiallyConsistent); + return SIMemOpInfo(); - return SIMemOpInfo::constructFromMIWithMMO(MI); + return constructFromMIWithMMO(MI); +} + +SICacheControl::SICacheControl(const GCNSubtarget &ST) { + TII = ST.getInstrInfo(); + IV = IsaInfo::getIsaVersion(ST.getFeatureBits()); } /* static */ -Optional<SIMemOpInfo> SIMemOpInfo::getAtomicRmwInfo( - const MachineBasicBlock::iterator &MI) { - assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); +std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { + GCNSubtarget::Generation Generation = ST.getGeneration(); + if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS) + return make_unique<SIGfx6CacheControl>(ST); + return make_unique<SIGfx7CacheControl>(ST); +} - if (!(MI->mayLoad() && MI->mayStore())) - return None; +bool SIGfx6CacheControl::enableLoadCacheBypass( + const MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace) const { + assert(MI->mayLoad() && !MI->mayStore()); + bool Changed = false; - // Be conservative if there are no memory operands. 
- if (MI->getNumMemOperands() == 0) - return SIMemOpInfo(SyncScope::System, - AtomicOrdering::SequentiallyConsistent); + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + /// TODO: Do not set glc for rmw atomic operations as they + /// implicitly bypass the L1 cache. - return SIMemOpInfo::constructFromMIWithMMO(MI); + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + Changed |= enableGLCBit(MI); + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to bypass. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory caches + /// to be bypassed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not hava a cache. + + return Changed; } -/* static */ -void SIMemOpInfo::reportUnknownSyncScope( - const MachineBasicBlock::iterator &MI) { - DiagnosticInfoUnsupported Diag(MI->getParent()->getParent()->getFunction(), - "Unsupported synchronization scope"); - LLVMContext *CTX = &MI->getParent()->getParent()->getFunction().getContext(); - CTX->diagnose(Diag); +bool SIGfx6CacheControl::enableNonTemporal( + const MachineBasicBlock::iterator &MI) const { + assert(MI->mayLoad() ^ MI->mayStore()); + bool Changed = false; + + /// TODO: Do not enableGLCBit if rmw atomic. + Changed |= enableGLCBit(MI); + Changed |= enableSLCBit(MI); + + return Changed; } -bool SIMemoryLegalizer::insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI, - bool Before) const { +bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { + bool Changed = false; + MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - if (!Before) + if (Pos == Position::AFTER) ++MI; - BuildMI(MBB, MI, DL, TII->get(Wbinvl1Opcode)); + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1)); + Changed = true; + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to invalidate. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory cache + /// to be flushed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not hava a cache. 
- if (!Before) + if (Pos == Position::AFTER) --MI; - return true; + return Changed; } -bool SIMemoryLegalizer::insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI, - bool Before) const { +bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + SIMemOp Op, + bool IsCrossAddrSpaceOrdering, + Position Pos) const { + bool Changed = false; + MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - if (!Before) + if (Pos == Position::AFTER) ++MI; - BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Vmcnt0Immediate); + bool VMCnt = false; + bool LGKMCnt = false; + bool EXPCnt = false; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + VMCnt = true; + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The L1 cache keeps all memory operations in order for + // wavefronts in the same work-group. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + case SIAtomicScope::WORKGROUP: + // If no cross address space ordering then an LDS waitcnt is not + // needed as LDS operations for all waves are executed in a + // total global ordering as observed by all waves. Required if + // also synchronizing with global/GDS memory as LDS operations + // could be reordered with respect to later global/GDS memory + // operations of the same wave. + LGKMCnt = IsCrossAddrSpaceOrdering; + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The LDS keeps all memory operations in order for + // the same wavesfront. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + // If no cross address space ordering then an GDS waitcnt is not + // needed as GDS operations for all waves are executed in a + // total global ordering as observed by all waves. Required if + // also synchronizing with global/LDS memory as GDS operations + // could be reordered with respect to later global/LDS memory + // operations of the same wave. + EXPCnt = IsCrossAddrSpaceOrdering; + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // The GDS keeps all memory operations in order for + // the same work-group. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } - if (!Before) + if (VMCnt || LGKMCnt || EXPCnt) { + unsigned WaitCntImmediate = + AMDGPU::encodeWaitcnt(IV, + VMCnt ? 0 : getVmcntBitMask(IV), + EXPCnt ? 0 : getExpcntBitMask(IV), + LGKMCnt ? 
0 : getLgkmcntBitMask(IV)); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate); + Changed = true; + } + + if (Pos == Position::AFTER) --MI; - return true; + return Changed; +} + +bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { + bool Changed = false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (Pos == Position::AFTER) + ++MI; + + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + switch (Scope) { + case SIAtomicScope::SYSTEM: + case SIAtomicScope::AGENT: + BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1_VOL)); + Changed = true; + break; + case SIAtomicScope::WORKGROUP: + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to invalidate. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + } + + /// The scratch address space does not need the global memory cache + /// to be flushed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not hava a cache. + + if (Pos == Position::AFTER) + --MI; + + return Changed; } bool SIMemoryLegalizer::removeAtomicPseudoMIs() { @@ -396,37 +866,38 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, bool Changed = false; if (MOI.isAtomic()) { - if (MOI.getSSID() == SyncScope::System || - MOI.getSSID() == MMI->getAgentSSID()) { - if (MOI.getOrdering() == AtomicOrdering::Acquire || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= enableGLCBit(MI); - - if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= insertWaitcntVmcnt0(MI); - - if (MOI.getOrdering() == AtomicOrdering::Acquire || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { - Changed |= insertWaitcntVmcnt0(MI, false); - Changed |= insertBufferWbinvl1Vol(MI, false); - } - - return Changed; + if (MOI.getOrdering() == AtomicOrdering::Monotonic || + MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { + Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(), + MOI.getOrderingAddrSpace()); } - if (MOI.getSSID() == SyncScope::SingleThread || - MOI.getSSID() == MMI->getWorkgroupSSID() || - MOI.getSSID() == MMI->getWavefrontSSID()) { - return Changed; + if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) + Changed |= CC->insertWait(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + SIMemOp::LOAD | SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE); + + if (MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { + Changed |= CC->insertWait(MI, MOI.getScope(), + MOI.getInstrAddrSpace(), + SIMemOp::LOAD, + MOI.getIsCrossAddressSpaceOrdering(), + Position::AFTER); + Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + Position::AFTER); } - llvm_unreachable("Unsupported synchronization scope"); + return Changed; } // Atomic instructions do not have the nontemporal attribute. 
if (MOI.isNonTemporal()) { - Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); + Changed |= CC->enableNonTemporal(MI); return Changed; } @@ -440,28 +911,20 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, bool Changed = false; if (MOI.isAtomic()) { - if (MOI.getSSID() == SyncScope::System || - MOI.getSSID() == MMI->getAgentSSID()) { - if (MOI.getOrdering() == AtomicOrdering::Release || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= insertWaitcntVmcnt0(MI); + if (MOI.getOrdering() == AtomicOrdering::Release || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) + Changed |= CC->insertWait(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + SIMemOp::LOAD | SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE); - return Changed; - } - - if (MOI.getSSID() == SyncScope::SingleThread || - MOI.getSSID() == MMI->getWorkgroupSSID() || - MOI.getSSID() == MMI->getWavefrontSSID()) { - return Changed; - } - - llvm_unreachable("Unsupported synchronization scope"); + return Changed; } // Atomic instructions do not have the nontemporal attribute. if (MOI.isNonTemporal()) { - Changed |= enableGLCBit(MI); - Changed |= enableSLCBit(MI); + Changed |= CC->enableNonTemporal(MI); return Changed; } @@ -472,111 +935,74 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI) { assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE); + AtomicPseudoMIs.push_back(MI); bool Changed = false; if (MOI.isAtomic()) { - if (MOI.getSSID() == SyncScope::System || - MOI.getSSID() == MMI->getAgentSSID()) { - if (MOI.getOrdering() == AtomicOrdering::Acquire || - MOI.getOrdering() == AtomicOrdering::Release || - MOI.getOrdering() == AtomicOrdering::AcquireRelease || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= insertWaitcntVmcnt0(MI); - - if (MOI.getOrdering() == AtomicOrdering::Acquire || - MOI.getOrdering() == AtomicOrdering::AcquireRelease || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= insertBufferWbinvl1Vol(MI); - - AtomicPseudoMIs.push_back(MI); - return Changed; - } + if (MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::Release || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) + /// TODO: This relies on a barrier always generating a waitcnt + /// for LDS to ensure it is not reordered with the completion of + /// the proceeding LDS operations. If barrier had a memory + /// ordering and memory scope, then library does not need to + /// generate a fence. Could add support in this file for + /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally + /// adding waitcnt before a S_BARRIER. 
+ Changed |= CC->insertWait(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + SIMemOp::LOAD | SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE); + + if (MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) + Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + Position::BEFORE); - if (MOI.getSSID() == SyncScope::SingleThread || - MOI.getSSID() == MMI->getWorkgroupSSID() || - MOI.getSSID() == MMI->getWavefrontSSID()) { - AtomicPseudoMIs.push_back(MI); - return Changed; - } - - SIMemOpInfo::reportUnknownSyncScope(MI); - } - - return Changed; -} - -bool SIMemoryLegalizer::expandAtomicCmpxchg(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) { - assert(MI->mayLoad() && MI->mayStore()); - - bool Changed = false; - - if (MOI.isAtomic()) { - if (MOI.getSSID() == SyncScope::System || - MOI.getSSID() == MMI->getAgentSSID()) { - if (MOI.getOrdering() == AtomicOrdering::Release || - MOI.getOrdering() == AtomicOrdering::AcquireRelease || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || - MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= insertWaitcntVmcnt0(MI); - - if (MOI.getOrdering() == AtomicOrdering::Acquire || - MOI.getOrdering() == AtomicOrdering::AcquireRelease || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || - MOI.getFailureOrdering() == AtomicOrdering::Acquire || - MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { - Changed |= insertWaitcntVmcnt0(MI, false); - Changed |= insertBufferWbinvl1Vol(MI, false); - } - - return Changed; - } - - if (MOI.getSSID() == SyncScope::SingleThread || - MOI.getSSID() == MMI->getWorkgroupSSID() || - MOI.getSSID() == MMI->getWavefrontSSID()) { - Changed |= enableGLCBit(MI); - return Changed; - } - - llvm_unreachable("Unsupported synchronization scope"); + return Changed; } return Changed; } -bool SIMemoryLegalizer::expandAtomicRmw(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) { +bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI) { assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; if (MOI.isAtomic()) { - if (MOI.getSSID() == SyncScope::System || - MOI.getSSID() == MMI->getAgentSSID()) { - if (MOI.getOrdering() == AtomicOrdering::Release || - MOI.getOrdering() == AtomicOrdering::AcquireRelease || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) - Changed |= insertWaitcntVmcnt0(MI); - - if (MOI.getOrdering() == AtomicOrdering::Acquire || - MOI.getOrdering() == AtomicOrdering::AcquireRelease || - MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { - Changed |= insertWaitcntVmcnt0(MI, false); - Changed |= insertBufferWbinvl1Vol(MI, false); - } - - return Changed; + if (MOI.getOrdering() == AtomicOrdering::Release || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || + MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) + Changed |= CC->insertWait(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + SIMemOp::LOAD | SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::BEFORE); + + if (MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || + MOI.getFailureOrdering() == 
AtomicOrdering::Acquire || + MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { + Changed |= CC->insertWait(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + isAtomicRet(*MI) ? SIMemOp::LOAD : + SIMemOp::STORE, + MOI.getIsCrossAddressSpaceOrdering(), + Position::AFTER); + Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(), + MOI.getOrderingAddrSpace(), + Position::AFTER); } - if (MOI.getSSID() == SyncScope::SingleThread || - MOI.getSSID() == MMI->getWorkgroupSSID() || - MOI.getSSID() == MMI->getWavefrontSSID()) { - Changed |= enableGLCBit(MI); - return Changed; - } - - llvm_unreachable("Unsupported synchronization scope"); + return Changed; } return Changed; @@ -584,32 +1010,23 @@ bool SIMemoryLegalizer::expandAtomicRmw(const SIMemOpInfo &MOI, bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const IsaInfo::IsaVersion IV = IsaInfo::getIsaVersion(ST.getFeatureBits()); - - MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); - TII = ST.getInstrInfo(); - Vmcnt0Immediate = - AMDGPU::encodeWaitcnt(IV, 0, getExpcntBitMask(IV), getLgkmcntBitMask(IV)); - Wbinvl1Opcode = ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS ? - AMDGPU::BUFFER_WBINVL1 : AMDGPU::BUFFER_WBINVL1_VOL; + SIMemOpAccess MOA(MF); + CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>()); for (auto &MBB : MF) { for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) continue; - if (const auto &MOI = SIMemOpInfo::getLoadInfo(MI)) + if (const auto &MOI = MOA.getLoadInfo(MI)) Changed |= expandLoad(MOI.getValue(), MI); - else if (const auto &MOI = SIMemOpInfo::getStoreInfo(MI)) + else if (const auto &MOI = MOA.getStoreInfo(MI)) Changed |= expandStore(MOI.getValue(), MI); - else if (const auto &MOI = SIMemOpInfo::getAtomicFenceInfo(MI)) + else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) Changed |= expandAtomicFence(MOI.getValue(), MI); - else if (const auto &MOI = SIMemOpInfo::getAtomicCmpxchgInfo(MI)) - Changed |= expandAtomicCmpxchg(MOI.getValue(), MI); - else if (const auto &MOI = SIMemOpInfo::getAtomicRmwInfo(MI)) - Changed |= expandAtomicRmw(MOI.getValue(), MI); + else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) + Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI); } } diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 2dc6f2702b3b..ebcad30a1866 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -10,6 +10,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -76,7 +77,7 @@ static unsigned isCopyToExec(const MachineInstr &MI) { case AMDGPU::COPY: case AMDGPU::S_MOV_B64: { const MachineOperand &Dst = MI.getOperand(0); - if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC) + if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC && MI.getOperand(1).isReg()) return MI.getOperand(1).getReg(); break; } @@ -208,7 +209,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const 
SIInstrInfo *TII = ST.getInstrInfo(); @@ -243,11 +244,11 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec if (CopyToExecInst->getOperand(1).isKill() && isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) { - DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst); + LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst); PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC); - DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n'); + LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n'); CopyToExecInst->eraseFromParent(); } @@ -257,7 +258,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { if (isLiveOut(MBB, CopyToExec)) { // The copied register is live out and has a second use in another block. - DEBUG(dbgs() << "Exec copy source register is live out\n"); + LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n"); continue; } @@ -269,7 +270,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator(); J != JE; ++J) { if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) { - DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n'); + LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n'); // Make sure this is inserted after any VALU ops that may have been // scheduled in between. SaveExecInst = nullptr; @@ -280,8 +281,8 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { if (J->modifiesRegister(CopyToExec, TRI)) { if (SaveExecInst) { - DEBUG(dbgs() << "Multiple instructions modify " - << printReg(CopyToExec, TRI) << '\n'); + LLVM_DEBUG(dbgs() << "Multiple instructions modify " + << printReg(CopyToExec, TRI) << '\n'); SaveExecInst = nullptr; break; } @@ -292,10 +293,11 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { if (ReadsCopyFromExec) { SaveExecInst = &*J; - DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n'); + LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n'); continue; } else { - DEBUG(dbgs() << "Instruction does not read exec copy: " << *J << '\n'); + LLVM_DEBUG(dbgs() + << "Instruction does not read exec copy: " << *J << '\n'); break; } } else if (ReadsCopyFromExec && !SaveExecInst) { @@ -307,8 +309,8 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { // spill %sgpr0_sgpr1 // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1 // - DEBUG(dbgs() << "Found second use of save inst candidate: " - << *J << '\n'); + LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J + << '\n'); break; } @@ -321,7 +323,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { if (!SaveExecInst) continue; - DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n'); + LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n'); MachineOperand &Src0 = SaveExecInst->getOperand(1); MachineOperand &Src1 = SaveExecInst->getOperand(2); diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 83074773c495..7b678d12ba81 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This pass removes redundant S_OR_B64 instructions enabling lanes in +/// This pass removes redundant S_OR_B64 
instructions enabling lanes in /// the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any /// vector instructions between them we can only keep outer SI_END_CF, given /// that CFG is structured and exec bits of the outer end statement are always @@ -23,6 +23,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -106,7 +107,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -134,7 +135,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { } while (I != E) { - if (I->isDebugValue()) { + if (I->isDebugInstr()) { I = std::next(I); continue; } @@ -143,7 +144,8 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef()) break; - DEBUG(dbgs() << "Removing no effect instruction: " << *I << '\n'); + LLVM_DEBUG(dbgs() + << "Removing no effect instruction: " << *I << '\n'); for (auto &Op : I->operands()) { if (Op.isReg()) @@ -193,7 +195,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { !getOrExecSource(*NextLead, *TII, MRI)) continue; - DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n'); + LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n'); auto SaveExec = getOrExecSource(*Lead, *TII, MRI); unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII); @@ -224,7 +226,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { break; } - DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n'); + LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n'); } if (SafeToReplace) { diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 5ed7fdf220bf..0e000b72962e 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -25,6 +25,7 @@ #include "SIDefines.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" @@ -39,6 +40,7 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Config/llvm-config.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Pass.h" @@ -86,11 +88,11 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override; - void matchSDWAOperands(MachineFunction &MF); + void matchSDWAOperands(MachineBasicBlock &MBB); std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI); - bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const; + bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const; bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); - void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const; + void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; StringRef getPassName() const override { return "SI Peephole SDWA"; 
} @@ -218,7 +220,7 @@ FunctionPass *llvm::createSIPeepholeSDWAPass() { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) { +static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) { switch(Sel) { case BYTE_0: OS << "BYTE_0"; break; case BYTE_1: OS << "BYTE_1"; break; @@ -366,18 +368,53 @@ MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { // Find operand in instruction that matches source operand and replace it with // target operand. Set corresponding src_sel - + bool IsPreserveSrc = false; MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0); MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); MachineOperand *SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); assert(Src && (Src->isReg() || Src->isImm())); if (!isSameReg(*Src, *getReplacedOperand())) { - // If this is not src0 then it should be src1 + // If this is not src0 then it could be src1 Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1); SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); + if (!Src || + !isSameReg(*Src, *getReplacedOperand())) { + // It's possible this Src is a tied operand for + // UNUSED_PRESERVE, in which case we can either + // abandon the peephole attempt, or if legal we can + // copy the target operand into the tied slot + // if the preserve operation will effectively cause the same + // result by overwriting the rest of the dst. + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + MachineOperand *DstUnused = + TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); + + if (Dst && + DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { + // This will work if the tied src is acessing WORD_0, and the dst is + // writing WORD_1. Modifiers don't matter because all the bits that + // would be impacted are being overwritten by the dst. + // Any other case will not work. 
+ SdwaSel DstSel = static_cast<SdwaSel>( + TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel)); + if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 && + getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) { + IsPreserveSrc = true; + auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vdst); + auto TiedIdx = MI.findTiedOperandIdx(DstIdx); + Src = &MI.getOperand(TiedIdx); + SrcSel = nullptr; + SrcMods = nullptr; + } else { + // Not legal to convert this src + return false; + } + } + } assert(Src && Src->isReg()); if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || @@ -388,11 +425,14 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { return false; } - assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods); + assert(isSameReg(*Src, *getReplacedOperand()) && + (IsPreserveSrc || (SrcSel && SrcMods))); } copyRegOperand(*Src, *getTargetOperand()); - SrcSel->setImm(getSrcSel()); - SrcMods->setImm(getSrcMods(TII, Src)); + if (!IsPreserveSrc) { + SrcSel->setImm(getSrcSel()); + SrcMods->setImm(getSrcMods(TII, Src)); + } getTargetOperand()->setIsKill(false); return true; } @@ -661,7 +701,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (TRI->isPhysicalRegister(Src1->getReg()) || + if (TRI->isPhysicalRegister(ValSrc->getReg()) || TRI->isPhysicalRegister(Dst->getReg())) break; @@ -739,8 +779,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { // TODO: add support for non-SDWA instructions as OtherInst. // For now this only works with SDWA instructions. For regular instructions - // there is no way to determine if instruction write only 8/16/24-bit out of - // full register size and all registers are at min 32-bit wide. + // there is no way to determine if the instruction writes only 8/16/24-bit + // out of full register size and all registers are at min 32-bit wide. 
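A side note on the WORD_0 / WORD_1 preserve case above: the hunk's own comment argues legality purely from non-overlapping bit ranges, and that argument is easy to sanity-check with ordinary integers. The short standalone C++ sketch below is illustrative only (it is not code from this pass); it models a 32-bit register in which the destination writes the high word while the preserved bits come from the tied value, so the low word that the converted source selects is left untouched.

// Illustrative model of the dst_sel == WORD_1 / src_sel == WORD_0 preserve
// case discussed above; not the SIPeepholeSDWA implementation itself.
#include <cassert>
#include <cstdint>

// src_sel == WORD_0: the source operand reads bits 15:0 of the register.
static uint32_t selectWord0(uint32_t Reg) { return Reg & 0xffffu; }

// dst_sel == WORD_1 with UNUSED_PRESERVE: the 16-bit result lands in bits
// 31:16 and the remaining bits are taken from the old (tied) value.
static uint32_t writeWord1Preserve(uint32_t Tied, uint32_t Result16) {
  return (Tied & 0x0000ffffu) | ((Result16 & 0xffffu) << 16);
}

int main() {
  uint32_t Tied = 0xAAAA1234u;              // old destination / tied operand
  uint32_t Src = selectWord0(Tied);         // 0x1234
  uint32_t Result = (Src + 1) & 0xffffu;    // some 16-bit computation
  uint32_t Dst = writeWord1Preserve(Tied, Result);

  // The bits the WORD_0 source selects are exactly the bits the preserve
  // keeps, so the high-word write cannot disturb them.
  assert(selectWord0(Dst) == selectWord0(Tied));
  assert(Dst == 0x12351234u);
  return 0;
}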
if (!TII->isSDWA(*OtherInst)) break; @@ -804,20 +844,18 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { return std::unique_ptr<SDWAOperand>(nullptr); } -void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (auto Operand = matchSDWAOperand(MI)) { - DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n'); - SDWAOperands[&MI] = std::move(Operand); - ++NumSDWAPatternsFound; - } +void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { + for (MachineInstr &MI : MBB) { + if (auto Operand = matchSDWAOperand(MI)) { + LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n'); + SDWAOperands[&MI] = std::move(Operand); + ++NumSDWAPatternsFound; } } } bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, - const SISubtarget &ST) const { + const GCNSubtarget &ST) const { // Check if this is already an SDWA instruction unsigned Opc = MI.getOpcode(); if (TII->isSDWA(Opc)) @@ -854,11 +892,18 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, Opc == AMDGPU::V_MAC_F32_e32)) return false; + // FIXME: has SDWA but require handling of implicit VCC use + if (Opc == AMDGPU::V_CNDMASK_B32_e32) + return false; + return true; } bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands) { + + LLVM_DEBUG(dbgs() << "Convert instruction:" << MI); + // Convert to sdwa int SDWAOpcode; unsigned Opcode = MI.getOpcode(); @@ -984,9 +1029,29 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, } } - // Apply all sdwa operand pattenrs + // Check for a preserved register that needs to be copied. + auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); + if (DstUnused && + DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { + // We expect, if we are here, that the instruction was already in it's SDWA form, + // with a tied operand. + assert(Dst && Dst->isTied()); + assert(Opcode == static_cast<unsigned int>(SDWAOpcode)); + // We also expect a vdst, since sdst can't preserve. + auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst); + assert(PreserveDstIdx != -1); + + auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx); + auto Tied = MI.getOperand(TiedIdx); + + SDWAInst.add(Tied); + SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1); + } + + // Apply all sdwa operand patterns. bool Converted = false; for (auto &Operand : SDWAOperands) { + LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand); // There should be no intesection between SDWA operands and potential MIs // e.g.: // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0 @@ -1007,8 +1072,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, return false; } - DEBUG(dbgs() << "Convert instruction:" << MI - << "Into:" << *SDWAInst << '\n'); + LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n'); ++NumSDWAInstructionsPeepholed; MI.eraseFromParent(); @@ -1017,7 +1081,8 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, // If an instruction was converted to SDWA it should not have immediates or SGPR // operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs. 
-void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const { +void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, + const GCNSubtarget &ST) const { const MCInstrDesc &Desc = TII->get(MI.getOpcode()); unsigned ConstantBusCount = 0; for (MachineOperand &Op : MI.explicit_uses()) { @@ -1048,7 +1113,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget } bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); if (!ST.hasSDWA() || skipFunction(MF.getFunction())) return false; @@ -1058,35 +1123,36 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { TII = ST.getInstrInfo(); // Find all SDWA operands in MF. - bool Changed = false; bool Ret = false; - do { - matchSDWAOperands(MF); - - for (const auto &OperandPair : SDWAOperands) { - const auto &Operand = OperandPair.second; - MachineInstr *PotentialMI = Operand->potentialToConvert(TII); - if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { - PotentialMatches[PotentialMI].push_back(Operand.get()); + for (MachineBasicBlock &MBB : MF) { + bool Changed = false; + do { + matchSDWAOperands(MBB); + + for (const auto &OperandPair : SDWAOperands) { + const auto &Operand = OperandPair.second; + MachineInstr *PotentialMI = Operand->potentialToConvert(TII); + if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { + PotentialMatches[PotentialMI].push_back(Operand.get()); + } } - } - for (auto &PotentialPair : PotentialMatches) { - MachineInstr &PotentialMI = *PotentialPair.first; - convertToSDWA(PotentialMI, PotentialPair.second); - } - - PotentialMatches.clear(); - SDWAOperands.clear(); + for (auto &PotentialPair : PotentialMatches) { + MachineInstr &PotentialMI = *PotentialPair.first; + convertToSDWA(PotentialMI, PotentialPair.second); + } - Changed = !ConvertedInstructions.empty(); + PotentialMatches.clear(); + SDWAOperands.clear(); - if (Changed) - Ret = true; + Changed = !ConvertedInstructions.empty(); - while (!ConvertedInstructions.empty()) - legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); - } while (Changed); + if (Changed) + Ret = true; + while (!ConvertedInstructions.empty()) + legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); + } while (Changed); + } return Ret; } diff --git a/lib/Target/AMDGPU/SIProgramInfo.h b/lib/Target/AMDGPU/SIProgramInfo.h new file mode 100644 index 000000000000..383f6b575808 --- /dev/null +++ b/lib/Target/AMDGPU/SIProgramInfo.h @@ -0,0 +1,77 @@ +//===--- SIProgramInfo.h ----------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Defines struct to track resource usage for kernels and entry functions. +/// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H +#define LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H + +namespace llvm { + +/// Track resource usage for kernels / entry functions. +struct SIProgramInfo { + // Fields set in PGM_RSRC1 pm4 packet. 
+ uint32_t VGPRBlocks = 0; + uint32_t SGPRBlocks = 0; + uint32_t Priority = 0; + uint32_t FloatMode = 0; + uint32_t Priv = 0; + uint32_t DX10Clamp = 0; + uint32_t DebugMode = 0; + uint32_t IEEEMode = 0; + uint64_t ScratchSize = 0; + + uint64_t ComputePGMRSrc1 = 0; + + // Fields set in PGM_RSRC2 pm4 packet. + uint32_t LDSBlocks = 0; + uint32_t ScratchBlocks = 0; + + uint64_t ComputePGMRSrc2 = 0; + + uint32_t NumVGPR = 0; + uint32_t NumSGPR = 0; + uint32_t LDSSize = 0; + bool FlatUsed = false; + + // Number of SGPRs that meets number of waves per execution unit request. + uint32_t NumSGPRsForWavesPerEU = 0; + + // Number of VGPRs that meets number of waves per execution unit request. + uint32_t NumVGPRsForWavesPerEU = 0; + + // Fixed SGPR number used to hold wave scratch offset for entire kernel + // execution, or std::numeric_limits<uint16_t>::max() if the register is not + // used or not known. + uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR = + std::numeric_limits<uint16_t>::max(); + + // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire + // kernel execution, or std::numeric_limits<uint16_t>::max() if the register + // is not used or not known. + uint16_t DebuggerPrivateSegmentBufferSGPR = + std::numeric_limits<uint16_t>::max(); + + // Whether there is recursion, dynamic allocas, indirect calls or some other + // reason there may be statically unknown stack usage. + bool DynamicCallStack = false; + + // Bonus information for debugging. + bool VCCUsed = false; + + SIProgramInfo() = default; +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_SIPROGRAMINFO_H diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 65cdc13e03cd..624607f6ea54 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -8,14 +8,16 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief SI implementation of the TargetRegisterInfo class. +/// SI implementation of the TargetRegisterInfo class. 
// //===----------------------------------------------------------------------===// #include "SIRegisterInfo.h" +#include "AMDGPURegisterBankInfo.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/RegisterScavenging.h" @@ -54,7 +56,7 @@ static cl::opt<bool> EnableSpillSGPRToVGPR( cl::ReallyHidden, cl::init(true)); -SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) : +SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : AMDGPURegisterInfo(), SGPRPressureSets(getNumRegPressureSets()), VGPRPressureSets(getNumRegPressureSets()), @@ -101,17 +103,10 @@ SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) : VGPRSetID < NumRegPressureSets); } -void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { - MCRegAliasIterator R(Reg, this, true); - - for (; R.isValid(); ++R) - Reserved.set(*R); -} - unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); @@ -136,7 +131,7 @@ static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF)); return AMDGPU::SGPR_32RegClass.getRegister(Reg); } @@ -163,6 +158,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE); reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT); + // Reserve xnack_mask registers - support is not implemented in Codegen. + reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK); + // Reserve Trap Handler registers - support is not implemented in Codegen. reserveRegisterTuples(Reserved, AMDGPU::TBA); reserveRegisterTuples(Reserved, AMDGPU::TMA); @@ -175,7 +173,7 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); @@ -255,7 +253,7 @@ bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( // create a virtual register for it during frame index elimination, so the // scavenger is directly needed. 
return MF.getFrameInfo().hasStackObjects() && - MF.getSubtarget<SISubtarget>().hasScalarStores() && + MF.getSubtarget<GCNSubtarget>().hasScalarStores() && MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs(); } @@ -310,7 +308,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, DL = Ins->getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = Subtarget.getInstrInfo(); if (Offset == 0) { @@ -339,7 +337,7 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, MachineBasicBlock *MBB = MI.getParent(); MachineFunction *MF = MBB->getParent(); - const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = Subtarget.getInstrInfo(); #ifndef NDEBUG @@ -526,7 +524,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, RegScavenger *RS) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MI->getParent()->getParent(); - const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const MachineFrameInfo &MFI = MF->getFrameInfo(); @@ -534,22 +532,29 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, const DebugLoc &DL = MI->getDebugLoc(); bool IsStore = Desc.mayStore(); - bool RanOutOfSGPRs = false; bool Scavenged = false; unsigned SOffset = ScratchOffsetReg; + const unsigned EltSize = 4; const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg); - unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / 32; - unsigned Size = NumSubRegs * 4; + unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT); + unsigned Size = NumSubRegs * EltSize; int64_t Offset = InstOffset + MFI.getObjectOffset(Index); - const int64_t OriginalImmOffset = Offset; + int64_t ScratchOffsetRegDelta = 0; unsigned Align = MFI.getObjectAlignment(Index); const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo(); - if (!isUInt<12>(Offset + Size)) { + assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset"); + + if (!isUInt<12>(Offset + Size - EltSize)) { SOffset = AMDGPU::NoRegister; + // We currently only support spilling VGPRs to EltSize boundaries, meaning + // we can simplify the adjustment of Offset here to just scale with + // WavefrontSize. + Offset *= ST.getWavefrontSize(); + // We don't have access to the register scavenger if this function is called // during PEI::scavengeFrameVirtualRegs(). if (RS) @@ -563,8 +568,8 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, // add the offset directly to the ScratchOffset register, and then // subtract the offset after the spill to return ScratchOffset to it's // original value. - RanOutOfSGPRs = true; SOffset = ScratchOffsetReg; + ScratchOffsetRegDelta = Offset; } else { Scavenged = true; } @@ -576,8 +581,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, Offset = 0; } - const unsigned EltSize = 4; - for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) { unsigned SubReg = NumSubRegs == 1 ? 
ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i)); @@ -609,11 +612,11 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState); } - if (RanOutOfSGPRs) { + if (ScratchOffsetRegDelta != 0) { // Subtract the offset we added to the ScratchOffset register. BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg) - .addReg(ScratchOffsetReg) - .addImm(OriginalImmOffset); + .addReg(ScratchOffsetReg) + .addImm(ScratchOffsetRegDelta); } } @@ -640,6 +643,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, MachineBasicBlock *MBB = MI->getParent(); MachineFunction *MF = MBB->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + DenseSet<unsigned> SGPRSpillVGPRDefinedSet; ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills = MFI->getSGPRToVGPRSpills(Index); @@ -648,7 +652,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, return false; MachineRegisterInfo &MRI = MF->getRegInfo(); - const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); unsigned SuperReg = MI->getOperand(0).getReg(); @@ -661,6 +665,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, if (SpillToSMEM && OnlyToVGPR) return false; + assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && + SuperReg != MFI->getFrameOffsetReg() && + SuperReg != MFI->getScratchWaveOffsetReg())); + assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); unsigned OffsetReg = AMDGPU::M0; @@ -736,11 +744,21 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, if (SpillToVGPR) { SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i]; + // During SGPR spilling to VGPR, determine if the VGPR is defined. The + // only circumstance in which we say it is undefined is when it is the + // first spill to this VGPR in the first basic block. + bool VGPRDefined = true; + if (MBB == &MF->front()) + VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second; + + // Mark the "old value of vgpr" input undef only if this is the first sgpr + // spill to this specific vgpr in the first basic block. BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), Spill.VGPR) .addReg(SubReg, getKillRegState(IsKill)) - .addImm(Spill.Lane); + .addImm(Spill.Lane) + .addReg(Spill.VGPR, VGPRDefined ? 
0 : RegState::Undef); // FIXME: Since this spills to another register instead of an actual // frame index, we should delete the frame index when all references to @@ -812,7 +830,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, return false; MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const DebugLoc &DL = MI->getDebugLoc(); @@ -972,7 +990,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); @@ -1051,8 +1069,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // Convert to an absolute stack address by finding the offset from the // scratch wave base and scaling by the wave size. // - // In an entry function/kernel the stack address is already the absolute - // address relative to the the scratch wave offset. + // In an entry function/kernel the stack address is already the + // absolute address relative to the scratch wave offset. unsigned DiffReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); @@ -1219,6 +1237,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { &AMDGPU::VReg_512RegClass, &AMDGPU::SReg_512RegClass, &AMDGPU::SCC_CLASSRegClass, + &AMDGPU::Pseudo_SReg_32RegClass, + &AMDGPU::Pseudo_SReg_128RegClass, }; for (const TargetRegisterClass *BaseClass : BaseClasses) { @@ -1355,7 +1375,7 @@ bool SIRegisterInfo::shouldRewriteCopySrc( return getCommonSubClass(DefRC, SrcRC) != nullptr; } -/// \brief Returns a register that is not used at any point in the function. +/// Returns a register that is not used at any point in the function. /// If all registers are used, then this function will return // AMDGPU::NoRegister. unsigned @@ -1483,7 +1503,9 @@ SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI, bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const { - return hasVGPRs(getRegClassForReg(MRI, Reg)); + const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg); + assert(RC && "Register class for the reg not found"); + return hasVGPRs(RC); } bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, @@ -1510,7 +1532,7 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), @@ -1545,3 +1567,34 @@ const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const { return Empty; return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit); } + +unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const { + // Not a callee saved register. 
+ return AMDGPU::SGPR30_SGPR31; +} + +const TargetRegisterClass * +SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, + const MachineRegisterInfo &MRI) const { + unsigned Size = getRegSizeInBits(MO.getReg(), MRI); + const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()); + if (!RB) + return nullptr; + + switch (Size) { + case 32: + return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass : + &AMDGPU::SReg_32_XM0RegClass; + case 64: + return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass : + &AMDGPU::SReg_64_XEXECRegClass; + case 96: + return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass : + nullptr; + case 128: + return RB->getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass : + &AMDGPU::SReg_128RegClass; + default: + llvm_unreachable("not implemented"); + } +} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index bf814b6974a8..5a51b67ca719 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Interface definition for SIRegisterInfo +/// Interface definition for SIRegisterInfo // //===----------------------------------------------------------------------===// @@ -16,15 +16,14 @@ #define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H #include "AMDGPURegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" #include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { +class GCNSubtarget; class LiveIntervals; class MachineRegisterInfo; -class SISubtarget; class SIMachineFunctionInfo; class SIRegisterInfo final : public AMDGPURegisterInfo { @@ -36,11 +35,10 @@ private: bool SpillSGPRToVGPR; bool SpillSGPRToSMEM; - void reserveRegisterTuples(BitVector &, unsigned Reg) const; void classifyPressureSet(unsigned PSetID, unsigned Reg, BitVector &PressureSets) const; public: - SIRegisterInfo(const SISubtarget &ST); + SIRegisterInfo(const GCNSubtarget &ST); bool spillSGPRToVGPR() const { return SpillSGPRToVGPR; @@ -126,7 +124,7 @@ public: return getEncodingValue(Reg) & 0xff; } - /// \brief Return the 'base' register class for this register. + /// Return the 'base' register class for this register. /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc. const TargetRegisterClass *getPhysRegClass(unsigned Reg) const; @@ -224,10 +222,11 @@ public: const int *getRegUnitPressureSets(unsigned RegUnit) const override; - unsigned getReturnAddressReg(const MachineFunction &MF) const { - // Not a callee saved register. 
- return AMDGPU::SGPR30_SGPR31; - } + unsigned getReturnAddressReg(const MachineFunction &MF) const; + + const TargetRegisterClass * + getConstrainedRegClassForOperand(const MachineOperand &MO, + const MachineRegisterInfo &MRI) const override; private: void buildSpillLoadStore(MachineBasicBlock::iterator MI, diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index dd0efef7f91b..f87a0763b353 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -76,6 +76,16 @@ def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>; def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>; def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>; +def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>; +def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>; + +def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>, + DwarfRegAlias<XNACK_MASK_LO> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 104; +} + // Trap handler registers def TBA_LO : SIReg<"tba_lo", 108>; def TBA_HI : SIReg<"tba_hi", 109>; @@ -394,7 +404,7 @@ def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], let CopyCost = -1; } -def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32, +def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32, (add PRIVATE_RSRC_REG)> { let isAllocatable = 0; let CopyCost = -1; @@ -403,7 +413,7 @@ def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32, // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, + (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> { let AllocationPriority = 7; @@ -425,22 +435,22 @@ def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, let AllocationPriority = 7; } -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> { +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> { let CopyCost = 1; let AllocationPriority = 8; } -def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> { +def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> { let isAllocatable = 0; } -def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, - (add SGPR_64, VCC, FLAT_SCR, TTMP_64, TBA, TMA)> { +def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32, + (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> { let CopyCost = 1; let AllocationPriority = 8; } -def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, +def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32, (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; let AllocationPriority = 8; @@ -457,7 +467,7 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128R let isAllocatable = 0; } -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64, v2f64], 32, (add SGPR_128, TTMP_128)> { let AllocationPriority = 10; } @@ -495,7 +505,7 @@ def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, } // 
Register class for all vector registers (VGPRs + Interploation Registers) -def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> { +def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, (add VGPR_64)> { let Size = 64; // Requires 2 v_mov_b32 to copy diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td index 0f02f5825cb0..7af69cb6a46d 100644 --- a/lib/Target/AMDGPU/SISchedule.td +++ b/lib/Target/AMDGPU/SISchedule.td @@ -46,7 +46,7 @@ def Write64Bit : SchedWrite; // instructions) class SISchedMachineModel : SchedMachineModel { - let CompleteModel = 1; + let CompleteModel = 0; // MicroOpBufferSize = 1 means that instructions will always be added // the ready queue when they become available. This exposes them // to the register pressure analysis. diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 41f989ad3228..4189bcce52ea 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -10,9 +10,9 @@ // #include "AMDGPU.h" -#include "AMDGPUMCInstLower.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -64,17 +64,6 @@ FunctionPass *llvm::createSIShrinkInstructionsPass() { return new SIShrinkInstructions(); } -static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI, - const MachineRegisterInfo &MRI) { - if (!MO->isReg()) - return false; - - if (TargetRegisterInfo::isVirtualRegister(MO->getReg())) - return TRI.hasVGPRs(MRI.getRegClass(MO->getReg())); - - return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg())); -} - static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) { @@ -92,14 +81,18 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, case AMDGPU::V_ADDC_U32_e64: case AMDGPU::V_SUBB_U32_e64: - if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isImm()) + case AMDGPU::V_SUBBREV_U32_e64: { + const MachineOperand *Src1 + = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg())) return false; // Additional verification is needed for sdst/src2. return true; - + } case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_F16_e64: - if (!isVGPR(Src2, TRI, MRI) || + case AMDGPU::V_FMAC_F32_e64: + if (!Src2->isReg() || !TRI.isVGPR(MRI, Src2->getReg()) || TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) return false; break; @@ -110,7 +103,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, } const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - if (Src1 && (!isVGPR(Src1, TRI, MRI) || + if (Src1 && (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg()) || TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) return false; @@ -124,7 +117,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp); } -/// \brief This function checks \p MI for operands defined by a move immediate +/// This function checks \p MI for operands defined by a move immediate /// instruction and then folds the literal constant into the instruction if it /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions. 
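The doc comment above describes a classic peephole: when an operand is produced by a move-immediate, fold the literal into the using instruction. Stripped of the MachineInstr machinery, the idea reduces to a def-use rewrite; the toy C++ sketch below (an illustration under simplified, made-up opcodes, not the SIShrinkInstructions code) shows the shape of that transformation on a miniature instruction list.

// Toy model of "fold a move-immediate into its user"; illustrative only.
#include <cstdint>
#include <cstdio>
#include <optional>
#include <unordered_map>
#include <vector>

enum class Op { MovImm, AddReg, AddImm };

struct Inst {
  Op Opcode;
  int Dst;                      // virtual register defined by this inst
  int SrcReg = -1;              // register operand, -1 if unused
  std::optional<int64_t> Imm;   // immediate operand, if any
};

// If the SrcReg of an AddReg is defined by a MovImm, rewrite it to AddImm.
static void foldImmediates(std::vector<Inst> &Prog) {
  std::unordered_map<int, int64_t> ImmDefs; // vreg -> literal it holds
  for (Inst &I : Prog) {
    if (I.Opcode == Op::MovImm) {           // assumes MovImm carries Imm
      ImmDefs[I.Dst] = *I.Imm;
      continue;
    }
    auto It = I.Opcode == Op::AddReg ? ImmDefs.find(I.SrcReg) : ImmDefs.end();
    if (It != ImmDefs.end()) {
      I.Opcode = Op::AddImm;                // fold the literal into the user
      I.Imm = It->second;
      I.SrcReg = -1;
    }
  }
}

int main() {
  std::vector<Inst> Prog = {
      {Op::MovImm, /*Dst=*/1, /*SrcReg=*/-1, /*Imm=*/42},
      {Op::AddReg, /*Dst=*/2, /*SrcReg=*/1, std::nullopt},
  };
  foldImmediates(Prog);
  std::printf("second inst folded: %d, imm=%lld\n",
              Prog[1].Opcode == Op::AddImm, (long long)*Prog[1].Imm);
  return 0;
}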
static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, @@ -290,7 +283,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { return false; MachineRegisterInfo &MRI = MF.getRegInfo(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); @@ -442,7 +435,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) // // So, instead of forcing the instruction to write to VCC, we provide - // a hint to the register allocator to use VCC and then we we will run + // a hint to the register allocator to use VCC and then we will run // this pass again after RA and shrink it if it outputs to VCC. MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC); continue; @@ -493,7 +486,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { } // We can shrink this instruction - DEBUG(dbgs() << "Shrinking " << MI); + LLVM_DEBUG(dbgs() << "Shrinking " << MI); MachineInstrBuilder Inst32 = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); @@ -537,9 +530,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MI.eraseFromParent(); foldImmediates(*Inst32, TII, MRI); - DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); - - + LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n'); } } return false; diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 53aefe829737..879726b1528c 100644 --- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This pass adds instructions to enable whole quad mode for pixel +/// This pass adds instructions to enable whole quad mode for pixel /// shaders, and whole wavefront mode for all programs. /// /// Whole quad mode is required for derivative computations, but it interferes @@ -60,6 +60,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" @@ -325,9 +326,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, unsigned Opcode = MI.getOpcode(); char Flags = 0; - if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) { - Flags = StateWQM; - } else if (TII->isWQM(Opcode)) { + if (TII->isWQM(Opcode)) { // Sampling instructions don't need to produce results for all pixels // in a quad, they just require all inputs of a quad to have been // computed for derivatives. @@ -454,6 +453,11 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, if (II.Needs != 0) markInstructionUses(MI, II.Needs, Worklist); + + // Ensure we process a block containing WWM, even if it does not require any + // WQM transitions. 
+ if (II.Needs & StateWWM) + BI.Needs |= StateWWM; } void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, @@ -681,7 +685,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) return; - DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) << ":\n"); + LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) + << ":\n"); unsigned SavedWQMReg = 0; unsigned SavedNonWWMReg = 0; @@ -844,7 +849,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LowerToCopyInstrs.clear(); CallingConv = MF.getFunction().getCallingConv(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); @@ -884,7 +889,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { } } - DEBUG(printInfo()); + LLVM_DEBUG(printInfo()); lowerCopyInstrs(); diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td index 8f347986eb8a..7485326017b2 100644 --- a/lib/Target/AMDGPU/SMInstructions.td +++ b/lib/Target/AMDGPU/SMInstructions.td @@ -63,6 +63,18 @@ class SM_Real <SM_Pseudo ps> bits<1> imm = !if(ps.has_offset, ps.offset_is_imm, 0); } +class SM_Probe_Pseudo <string opName, dag ins, bit isImm> + : SM_Pseudo<opName, (outs), ins, " $sdata, $sbase, $offset"> { + let mayLoad = 0; + let mayStore = 0; + let has_glc = 0; + let LGKM_CNT = 0; + let ScalarStore = 0; + let hasSideEffects = 1; + let offset_is_imm = isImm; + let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR"); +} + class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> : SM_Pseudo<opName, outs, ins, asmOps, pattern> { RegisterClass BaseClass; @@ -81,6 +93,18 @@ class SM_Store_Pseudo <string opName, dag ins, string asmOps, list<dag> pattern let ScalarStore = 1; } +class SM_Discard_Pseudo <string opName, dag ins, bit isImm> + : SM_Pseudo<opName, (outs), ins, " $sbase, $offset"> { + let mayLoad = 0; + let mayStore = 0; + let has_glc = 0; + let has_sdst = 0; + let ScalarStore = 0; + let hasSideEffects = 1; + let offset_is_imm = isImm; + let PseudoInstr = opName # !if(isImm, "_IMM", "_SGPR"); +} + multiclass SM_Pseudo_Loads<string opName, RegisterClass baseClass, RegisterClass dstClass> { @@ -125,6 +149,11 @@ multiclass SM_Pseudo_Stores<string opName, } } +multiclass SM_Pseudo_Discards<string opName> { + def _IMM : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, smrd_offset_20:$offset), 1>; + def _SGPR : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, SReg_32:$offset), 0>; +} + class SM_Time_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo< opName, (outs SReg_64_XEXEC:$sdst), (ins), " $sdst", [(set i64:$sdst, (node))]> { @@ -144,6 +173,60 @@ class SM_Inval_Pseudo <string opName, SDPatternOperator node> : SM_Pseudo< let has_offset = 0; } +multiclass SM_Pseudo_Probe<string opName, RegisterClass baseClass> { + def _IMM : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, smrd_offset_20:$offset), 1>; + def _SGPR : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, SReg_32:$offset), 0>; +} + +//===----------------------------------------------------------------------===// +// Scalar Atomic Memory Classes +//===----------------------------------------------------------------------===// + +class SM_Atomic_Pseudo <string opName, + dag outs, dag ins, string asmOps, bit isRet> + : SM_Pseudo<opName, outs, ins, asmOps, []> 
{ + + bit glc = isRet; + + let mayLoad = 1; + let mayStore = 1; + let has_glc = 1; + + // Should these be set? + let ScalarStore = 1; + let hasSideEffects = 1; + let maybeAtomic = 1; +} + +class SM_Pseudo_Atomic<string opName, + RegisterClass baseClass, + RegisterClass dataClass, + bit isImm, + bit isRet> : + SM_Atomic_Pseudo<opName, + !if(isRet, (outs dataClass:$sdst), (outs)), + !if(isImm, + (ins dataClass:$sdata, baseClass:$sbase, smrd_offset_20:$offset), + (ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset)), + !if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", ""), + isRet> { + let offset_is_imm = isImm; + let PseudoInstr = opName # !if(isImm, + !if(isRet, "_IMM_RTN", "_IMM"), + !if(isRet, "_SGPR_RTN", "_SGPR")); + + let Constraints = !if(isRet, "$sdst = $sdata", ""); + let DisableEncoding = !if(isRet, "$sdata", ""); +} + +multiclass SM_Pseudo_Atomics<string opName, + RegisterClass baseClass, + RegisterClass dataClass> { + def _IMM : SM_Pseudo_Atomic <opName, baseClass, dataClass, 1, 0>; + def _SGPR : SM_Pseudo_Atomic <opName, baseClass, dataClass, 0, 0>; + def _IMM_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, 1, 1>; + def _SGPR_RTN : SM_Pseudo_Atomic <opName, baseClass, dataClass, 0, 1>; +} //===----------------------------------------------------------------------===// // Scalar Memory Instructions @@ -211,9 +294,85 @@ let SubtargetPredicate = isVI in { def S_DCACHE_WB : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>; def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>; def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>; -} // SubtargetPredicate = isVI +defm S_ATC_PROBE : SM_Pseudo_Probe <"s_atc_probe", SReg_64>; +defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>; +} // SubtargetPredicate = isVI +let SubtargetPredicate = HasFlatScratchInsts, Uses = [FLAT_SCR] in { +defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>; +defm S_SCRATCH_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_scratch_load_dwordx2", SReg_64, SReg_64_XEXEC>; +defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_scratch_load_dwordx4", SReg_64, SReg_128>; + +defm S_SCRATCH_STORE_DWORD : SM_Pseudo_Stores <"s_scratch_store_dword", SReg_64, SReg_32_XM0_XEXEC>; +defm S_SCRATCH_STORE_DWORDX2 : SM_Pseudo_Stores <"s_scratch_store_dwordx2", SReg_64, SReg_64_XEXEC>; +defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <"s_scratch_store_dwordx4", SReg_64, SReg_128>; +} // SubtargetPredicate = HasFlatScratchInsts + +let SubtargetPredicate = HasScalarAtomics in { + +defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_buffer_atomic_swap", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics <"s_buffer_atomic_add", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_SUB : SM_Pseudo_Atomics <"s_buffer_atomic_sub", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_SMIN : SM_Pseudo_Atomics <"s_buffer_atomic_smin", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_UMIN : SM_Pseudo_Atomics <"s_buffer_atomic_umin", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_SMAX : SM_Pseudo_Atomics <"s_buffer_atomic_smax", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_UMAX : SM_Pseudo_Atomics <"s_buffer_atomic_umax", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_AND : SM_Pseudo_Atomics <"s_buffer_atomic_and", SReg_128, SReg_32_XM0_XEXEC>; +defm 
S_BUFFER_ATOMIC_OR : SM_Pseudo_Atomics <"s_buffer_atomic_or", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_XOR : SM_Pseudo_Atomics <"s_buffer_atomic_xor", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_INC : SM_Pseudo_Atomics <"s_buffer_atomic_inc", SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_ATOMIC_DEC : SM_Pseudo_Atomics <"s_buffer_atomic_dec", SReg_128, SReg_32_XM0_XEXEC>; + +defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_swap_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap_x2", SReg_128, SReg_128>; +defm S_BUFFER_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_add_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_sub_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_smin_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_umin_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_smax_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_umax_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_AND_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_and_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_or_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_xor_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_inc_x2", SReg_128, SReg_64_XEXEC>; +defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_dec_x2", SReg_128, SReg_64_XEXEC>; + +defm S_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_atomic_swap", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_atomic_cmpswap", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_ADD : SM_Pseudo_Atomics <"s_atomic_add", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_SUB : SM_Pseudo_Atomics <"s_atomic_sub", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_SMIN : SM_Pseudo_Atomics <"s_atomic_smin", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_UMIN : SM_Pseudo_Atomics <"s_atomic_umin", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_SMAX : SM_Pseudo_Atomics <"s_atomic_smax", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_UMAX : SM_Pseudo_Atomics <"s_atomic_umax", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_AND : SM_Pseudo_Atomics <"s_atomic_and", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_OR : SM_Pseudo_Atomics <"s_atomic_or", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_XOR : SM_Pseudo_Atomics <"s_atomic_xor", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_INC : SM_Pseudo_Atomics <"s_atomic_inc", SReg_64, SReg_32_XM0_XEXEC>; +defm S_ATOMIC_DEC : SM_Pseudo_Atomics <"s_atomic_dec", SReg_64, SReg_32_XM0_XEXEC>; + +defm S_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <"s_atomic_swap_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <"s_atomic_cmpswap_x2", SReg_64, SReg_128>; +defm S_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <"s_atomic_add_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <"s_atomic_sub_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <"s_atomic_smin_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <"s_atomic_umin_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <"s_atomic_smax_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <"s_atomic_umax_x2", SReg_64, 
SReg_64_XEXEC>; +defm S_ATOMIC_AND_X2 : SM_Pseudo_Atomics <"s_atomic_and_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_atomic_or_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_atomic_xor_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_atomic_inc_x2", SReg_64, SReg_64_XEXEC>; +defm S_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_atomic_dec_x2", SReg_64, SReg_64_XEXEC>; + +} // let SubtargetPredicate = HasScalarAtomics + +let SubtargetPredicate = isGFX9 in { +defm S_DCACHE_DISCARD : SM_Pseudo_Discards <"s_dcache_discard">; +defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards <"s_dcache_discard_x2">; +} //===----------------------------------------------------------------------===// // Scalar Memory Patterns @@ -223,11 +382,9 @@ def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime> def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ auto Ld = cast<LoadSDNode>(N); return Ld->getAlignment() >= 4 && - ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && - static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) || + ((((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) || (Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT)) && !N->isDivergent()) || (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && - !Ld->isVolatile() && - static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) && + !Ld->isVolatile() && !N->isDivergent() && static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N))); }]>; @@ -407,6 +564,11 @@ multiclass SM_Real_Stores_vi<bits<8> op, string ps, } } +multiclass SM_Real_Probe_vi<bits<8> op, string ps> { + def _IMM_vi : SMEM_Real_Store_vi <op, !cast<SM_Probe_Pseudo>(ps#_IMM)>; + def _SGPR_vi : SMEM_Real_Store_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>; +} + defm S_LOAD_DWORD : SM_Real_Loads_vi <0x00, "S_LOAD_DWORD">; defm S_LOAD_DWORDX2 : SM_Real_Loads_vi <0x01, "S_LOAD_DWORDX2">; defm S_LOAD_DWORDX4 : SM_Real_Loads_vi <0x02, "S_LOAD_DWORDX4">; @@ -434,6 +596,103 @@ def S_DCACHE_WB_VOL_vi : SMEM_Real_vi <0x23, S_DCACHE_WB_VOL>; def S_MEMTIME_vi : SMEM_Real_vi <0x24, S_MEMTIME>; def S_MEMREALTIME_vi : SMEM_Real_vi <0x25, S_MEMREALTIME>; +defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_vi <0x05, "S_SCRATCH_LOAD_DWORD">; +defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_vi <0x06, "S_SCRATCH_LOAD_DWORDX2">; +defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_vi <0x07, "S_SCRATCH_LOAD_DWORDX4">; + +defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_vi <0x15, "S_SCRATCH_STORE_DWORD">; +defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_vi <0x16, "S_SCRATCH_STORE_DWORDX2">; +defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_vi <0x17, "S_SCRATCH_STORE_DWORDX4">; + +defm S_ATC_PROBE : SM_Real_Probe_vi <0x26, "S_ATC_PROBE">; +defm S_ATC_PROBE_BUFFER : SM_Real_Probe_vi <0x27, "S_ATC_PROBE_BUFFER">; + +//===----------------------------------------------------------------------===// +// GFX9 +//===----------------------------------------------------------------------===// + +class SMEM_Atomic_Real_vi <bits<8> op, SM_Atomic_Pseudo ps> + : SMEM_Real_vi <op, ps> { + + bits<7> sdata; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + let glc = ps.glc; + let Inst{12-6} = !if(glc, sdst{6-0}, sdata{6-0}); +} + +multiclass SM_Real_Atomics_vi<bits<8> op, string ps> { + def _IMM_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>; + def _SGPR_vi : 
SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>; + def _IMM_RTN_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM_RTN)>; + def _SGPR_RTN_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_RTN)>; +} + +defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_vi <0x40, "S_BUFFER_ATOMIC_SWAP">; +defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x41, "S_BUFFER_ATOMIC_CMPSWAP">; +defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_vi <0x42, "S_BUFFER_ATOMIC_ADD">; +defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_vi <0x43, "S_BUFFER_ATOMIC_SUB">; +defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_vi <0x44, "S_BUFFER_ATOMIC_SMIN">; +defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_vi <0x45, "S_BUFFER_ATOMIC_UMIN">; +defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_vi <0x46, "S_BUFFER_ATOMIC_SMAX">; +defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_vi <0x47, "S_BUFFER_ATOMIC_UMAX">; +defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_vi <0x48, "S_BUFFER_ATOMIC_AND">; +defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_vi <0x49, "S_BUFFER_ATOMIC_OR">; +defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_vi <0x4a, "S_BUFFER_ATOMIC_XOR">; +defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_vi <0x4b, "S_BUFFER_ATOMIC_INC">; +defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_vi <0x4c, "S_BUFFER_ATOMIC_DEC">; + +defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0x60, "S_BUFFER_ATOMIC_SWAP_X2">; +defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0x61, "S_BUFFER_ATOMIC_CMPSWAP_X2">; +defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0x62, "S_BUFFER_ATOMIC_ADD_X2">; +defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0x63, "S_BUFFER_ATOMIC_SUB_X2">; +defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0x64, "S_BUFFER_ATOMIC_SMIN_X2">; +defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0x65, "S_BUFFER_ATOMIC_UMIN_X2">; +defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0x66, "S_BUFFER_ATOMIC_SMAX_X2">; +defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0x67, "S_BUFFER_ATOMIC_UMAX_X2">; +defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0x68, "S_BUFFER_ATOMIC_AND_X2">; +defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0x69, "S_BUFFER_ATOMIC_OR_X2">; +defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0x6a, "S_BUFFER_ATOMIC_XOR_X2">; +defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0x6b, "S_BUFFER_ATOMIC_INC_X2">; +defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0x6c, "S_BUFFER_ATOMIC_DEC_X2">; + +defm S_ATOMIC_SWAP : SM_Real_Atomics_vi <0x80, "S_ATOMIC_SWAP">; +defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x81, "S_ATOMIC_CMPSWAP">; +defm S_ATOMIC_ADD : SM_Real_Atomics_vi <0x82, "S_ATOMIC_ADD">; +defm S_ATOMIC_SUB : SM_Real_Atomics_vi <0x83, "S_ATOMIC_SUB">; +defm S_ATOMIC_SMIN : SM_Real_Atomics_vi <0x84, "S_ATOMIC_SMIN">; +defm S_ATOMIC_UMIN : SM_Real_Atomics_vi <0x85, "S_ATOMIC_UMIN">; +defm S_ATOMIC_SMAX : SM_Real_Atomics_vi <0x86, "S_ATOMIC_SMAX">; +defm S_ATOMIC_UMAX : SM_Real_Atomics_vi <0x87, "S_ATOMIC_UMAX">; +defm S_ATOMIC_AND : SM_Real_Atomics_vi <0x88, "S_ATOMIC_AND">; +defm S_ATOMIC_OR : SM_Real_Atomics_vi <0x89, "S_ATOMIC_OR">; +defm S_ATOMIC_XOR : SM_Real_Atomics_vi <0x8a, "S_ATOMIC_XOR">; +defm S_ATOMIC_INC : SM_Real_Atomics_vi <0x8b, "S_ATOMIC_INC">; +defm S_ATOMIC_DEC : SM_Real_Atomics_vi <0x8c, "S_ATOMIC_DEC">; + +defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0xa0, "S_ATOMIC_SWAP_X2">; +defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0xa1, "S_ATOMIC_CMPSWAP_X2">; +defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0xa2, "S_ATOMIC_ADD_X2">; +defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0xa3, "S_ATOMIC_SUB_X2">; +defm S_ATOMIC_SMIN_X2 : 
SM_Real_Atomics_vi <0xa4, "S_ATOMIC_SMIN_X2">; +defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0xa5, "S_ATOMIC_UMIN_X2">; +defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0xa6, "S_ATOMIC_SMAX_X2">; +defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0xa7, "S_ATOMIC_UMAX_X2">; +defm S_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0xa8, "S_ATOMIC_AND_X2">; +defm S_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0xa9, "S_ATOMIC_OR_X2">; +defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0xaa, "S_ATOMIC_XOR_X2">; +defm S_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0xab, "S_ATOMIC_INC_X2">; +defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0xac, "S_ATOMIC_DEC_X2">; + +multiclass SM_Real_Discard_vi<bits<8> op, string ps> { + def _IMM_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_IMM)>; + def _SGPR_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR)>; +} + +defm S_DCACHE_DISCARD : SM_Real_Discard_vi <0x28, "S_DCACHE_DISCARD">; +defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_vi <0x29, "S_DCACHE_DISCARD_X2">; //===----------------------------------------------------------------------===// // CI @@ -502,7 +761,7 @@ let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in { class SMRD_Pattern_ci <string Instr, ValueType vt> : GCNPat < (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), - (vt (!cast<SM_Pseudo>(Instr#"_IMM_ci") $sbase, $offset, 0))> { + (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> { let OtherPredicates = [isCIOnly]; } diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index 02a95a4b6f24..6f5db9644c86 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -19,17 +19,28 @@ def GPRIdxMode : Operand<i32> { let OperandType = "OPERAND_IMMEDIATE"; } +class SOP_Pseudo<string opName, dag outs, dag ins, string asmOps, + list<dag> pattern=[]> : + InstSI<outs, ins, "", pattern>, + SIMCInstr<opName, SIEncodingFamily.NONE> { + + let isPseudo = 1; + let isCodeGenOnly = 1; + let SubtargetPredicate = isGCN; + + string Mnemonic = opName; + string AsmOperands = asmOps; + + bits<1> has_sdst = 0; +} + //===----------------------------------------------------------------------===// // SOP1 Instructions //===----------------------------------------------------------------------===// class SOP1_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> : - InstSI <outs, ins, "", pattern>, - SIMCInstr<opName, SIEncodingFamily.NONE> { - let isPseudo = 1; - let isCodeGenOnly = 1; - let SubtargetPredicate = isGCN; + SOP_Pseudo<opName, outs, ins, asmOps, pattern> { let mayLoad = 0; let mayStore = 0; @@ -40,9 +51,6 @@ class SOP1_Pseudo <string opName, dag outs, dag ins, let Size = 4; let UseNamedOperandTable = 1; - string Mnemonic = opName; - string AsmOperands = asmOps; - bits<1> has_src0 = 1; bits<1> has_sdst = 1; } @@ -247,17 +255,25 @@ def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> { } } +let SubtargetPredicate = isGFX9 in { + let hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] in { + def S_ANDN1_SAVEEXEC_B64 : SOP1_64<"s_andn1_saveexec_b64">; + def S_ORN1_SAVEEXEC_B64 : SOP1_64<"s_orn1_saveexec_b64">; + def S_ANDN1_WREXEC_B64 : SOP1_64<"s_andn1_wrexec_b64">; + def S_ANDN2_WREXEC_B64 : SOP1_64<"s_andn2_wrexec_b64">; + } // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC] + + def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">; +} // End SubtargetPredicate = isGFX9 + //===----------------------------------------------------------------------===// // SOP2 Instructions 
//===----------------------------------------------------------------------===// class SOP2_Pseudo<string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> : - InstSI<outs, ins, "", pattern>, - SIMCInstr<opName, SIEncodingFamily.NONE> { - let isPseudo = 1; - let isCodeGenOnly = 1; - let SubtargetPredicate = isGCN; + SOP_Pseudo<opName, outs, ins, asmOps, pattern> { + let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; @@ -266,10 +282,7 @@ class SOP2_Pseudo<string opName, dag outs, dag ins, let SchedRW = [WriteSALU]; let UseNamedOperandTable = 1; - string Mnemonic = opName; - string AsmOperands = asmOps; - - bits<1> has_sdst = 1; + let has_sdst = 1; // Pseudo instructions have no encodings, but adding this field here allows // us to do: @@ -279,7 +292,7 @@ class SOP2_Pseudo<string opName, dag outs, dag ins, // let Size = 4; // Do we need size here? } -class SOP2_Real<bits<7> op, SOP2_Pseudo ps> : +class SOP2_Real<bits<7> op, SOP_Pseudo ps> : InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # " " # ps.AsmOperands, []>, Enc32 { @@ -482,6 +495,16 @@ let SubtargetPredicate = isGFX9 in { def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">; def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">; def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">; + + let Defs = [SCC] in { + def S_LSHL1_ADD_U32 : SOP2_32<"s_lshl1_add_u32">; + def S_LSHL2_ADD_U32 : SOP2_32<"s_lshl2_add_u32">; + def S_LSHL3_ADD_U32 : SOP2_32<"s_lshl3_add_u32">; + def S_LSHL4_ADD_U32 : SOP2_32<"s_lshl4_add_u32">; + } // End Defs = [SCC] + + def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32">; + def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32">; } //===----------------------------------------------------------------------===// @@ -659,6 +682,16 @@ def S_SETREG_IMM32_B32 : SOPK_Pseudo < } // End hasSideEffects = 1 +let SubtargetPredicate = isGFX9 in { + def S_CALL_B64 : SOPK_Pseudo< + "s_call_b64", + (outs SReg_64:$sdst), + (ins s16imm:$simm16), + "$sdst, $simm16"> { + let isCall = 1; + } +} + //===----------------------------------------------------------------------===// // SOPC Instructions //===----------------------------------------------------------------------===// @@ -806,6 +839,13 @@ def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> { } } +let SubtargetPredicate = isGFX9 in { + let isBarrier = 1, isReturn = 1, simm16 = 0 in { + def S_ENDPGM_ORDERED_PS_DONE : + SOPP<0x01e, (ins), "s_endpgm_ordered_ps_done">; + } // End isBarrier = 1, isReturn = 1, simm16 = 0 +} // End SubtargetPredicate = isGFX9 + let isBranch = 1, SchedRW = [WriteBranch] in { def S_BRANCH : SOPP < 0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16", @@ -1312,3 +1352,26 @@ def S_SETREG_B32_vi : SOPK_Real_vi <0x12, S_SETREG_B32>; //def S_GETREG_REGRD_B32_vi : SOPK_Real_vi <0x13, S_GETREG_REGRD_B32>; // see pseudo for comments def S_SETREG_IMM32_B32_vi : SOPK_Real64<0x14, S_SETREG_IMM32_B32>, Select_vi<S_SETREG_IMM32_B32.Mnemonic>; + +def S_CALL_B64_vi : SOPK_Real_vi <0x15, S_CALL_B64>; + +//===----------------------------------------------------------------------===// +// SOP1 - GFX9. 
+//===----------------------------------------------------------------------===// + +def S_ANDN1_SAVEEXEC_B64_vi : SOP1_Real_vi<0x33, S_ANDN1_SAVEEXEC_B64>; +def S_ORN1_SAVEEXEC_B64_vi : SOP1_Real_vi<0x34, S_ORN1_SAVEEXEC_B64>; +def S_ANDN1_WREXEC_B64_vi : SOP1_Real_vi<0x35, S_ANDN1_WREXEC_B64>; +def S_ANDN2_WREXEC_B64_vi : SOP1_Real_vi<0x36, S_ANDN2_WREXEC_B64>; +def S_BITREPLICATE_B64_B32_vi : SOP1_Real_vi<0x37, S_BITREPLICATE_B64_B32>; + +//===----------------------------------------------------------------------===// +// SOP2 - GFX9. +//===----------------------------------------------------------------------===// + +def S_LSHL1_ADD_U32_vi : SOP2_Real_vi<0x2e, S_LSHL1_ADD_U32>; +def S_LSHL2_ADD_U32_vi : SOP2_Real_vi<0x2f, S_LSHL2_ADD_U32>; +def S_LSHL3_ADD_U32_vi : SOP2_Real_vi<0x30, S_LSHL3_ADD_U32>; +def S_LSHL4_ADD_U32_vi : SOP2_Real_vi<0x31, S_LSHL4_ADD_U32>; +def S_MUL_HI_U32_vi : SOP2_Real_vi<0x2c, S_MUL_HI_U32>; +def S_MUL_HI_I32_vi : SOP2_Real_vi<0x2d, S_MUL_HI_I32>; diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp index f61e2e413ad4..e4c442db3016 100644 --- a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp +++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp @@ -16,19 +16,19 @@ using namespace llvm; -/// \brief The target which supports all AMD GPUs. This will eventually +/// The target which supports all AMD GPUs. This will eventually /// be deprecated and there will be a R600 target and a GCN target. Target &llvm::getTheAMDGPUTarget() { static Target TheAMDGPUTarget; return TheAMDGPUTarget; } -/// \brief The target for GCN GPUs +/// The target for GCN GPUs Target &llvm::getTheGCNTarget() { static Target TheGCNTarget; return TheGCNTarget; } -/// \brief Extern function to initialize the targets for the AMDGPU backend +/// Extern function to initialize the targets for the AMDGPU backend extern "C" void LLVMInitializeAMDGPUTargetInfo() { RegisterTarget<Triple::r600, false> R600(getTheAMDGPUTarget(), "r600", "AMD GPUs HD2XXX-HD6XXX", "AMDGPU"); diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 03b11ae80500..9eb4c6513cce 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -61,7 +61,15 @@ const char* const IdSymbolic[] = { "HW_REG_HW_ID", "HW_REG_GPR_ALLOC", "HW_REG_LDS_ALLOC", - "HW_REG_IB_STS" + "HW_REG_IB_STS", + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + "HW_REG_SH_MEM_BASES" }; } // namespace Hwreg diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 125a3b22d0cf..3fd3c75874a3 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUBaseInfo.h" +#include "AMDGPUTargetTransformInfo.h" #include "AMDGPU.h" #include "SIDefines.h" #include "llvm/ADT/StringRef.h" @@ -52,7 +53,7 @@ unsigned getBitMask(unsigned Shift, unsigned Width) { return ((1 << Width) - 1) << Shift; } -/// \brief Packs \p Src into \p Dst for given bit \p Shift and bit \p Width. +/// Packs \p Src into \p Dst for given bit \p Shift and bit \p Width. /// /// \returns Packed \p Dst. 
unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) { @@ -61,7 +62,7 @@ unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) { return Dst; } -/// \brief Unpacks bits from \p Src for given bit \p Shift and bit \p Width. +/// Unpacks bits from \p Src for given bit \p Shift and bit \p Width. /// /// \returns Unpacked bits. unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) { @@ -96,64 +97,34 @@ unsigned getVmcntBitWidthHi() { return 2; } namespace llvm { -static cl::opt<bool> EnablePackedInlinableLiterals( - "enable-packed-inlinable-literals", - cl::desc("Enable packed inlinable literals (v2f16, v2i16)"), - cl::init(false)); - namespace AMDGPU { -LLVM_READNONE -static inline Channels indexToChannel(unsigned Channel) { - switch (Channel) { - case 1: - return AMDGPU::Channels_1; - case 2: - return AMDGPU::Channels_2; - case 3: - return AMDGPU::Channels_3; - case 4: - return AMDGPU::Channels_4; - default: - llvm_unreachable("invalid MIMG channel"); - } -} +struct MIMGInfo { + uint16_t Opcode; + uint16_t BaseOpcode; + uint8_t MIMGEncoding; + uint8_t VDataDwords; + uint8_t VAddrDwords; +}; +#define GET_MIMGBaseOpcodesTable_IMPL +#define GET_MIMGDimInfoTable_IMPL +#define GET_MIMGInfoTable_IMPL +#include "AMDGPUGenSearchableTables.inc" -// FIXME: Need to handle d16 images correctly. -static unsigned rcToChannels(unsigned RCID) { - switch (RCID) { - case AMDGPU::VGPR_32RegClassID: - return 1; - case AMDGPU::VReg_64RegClassID: - return 2; - case AMDGPU::VReg_96RegClassID: - return 3; - case AMDGPU::VReg_128RegClassID: - return 4; - default: - llvm_unreachable("invalid MIMG register class"); - } +int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, + unsigned VDataDwords, unsigned VAddrDwords) { + const MIMGInfo *Info = getMIMGOpcodeHelper(BaseOpcode, MIMGEncoding, + VDataDwords, VAddrDwords); + return Info ? Info->Opcode : -1; } -int getMaskedMIMGOp(const MCInstrInfo &MII, unsigned Opc, unsigned NewChannels) { - AMDGPU::Channels Channel = AMDGPU::indexToChannel(NewChannels); - unsigned OrigChannels = rcToChannels(MII.get(Opc).OpInfo[0].RegClass); - if (NewChannels == OrigChannels) - return Opc; - - switch (OrigChannels) { - case 1: - return AMDGPU::getMaskedMIMGOp1(Opc, Channel); - case 2: - return AMDGPU::getMaskedMIMGOp2(Opc, Channel); - case 3: - return AMDGPU::getMaskedMIMGOp3(Opc, Channel); - case 4: - return AMDGPU::getMaskedMIMGOp4(Opc, Channel); - default: - llvm_unreachable("invalid MIMG channel"); - } +int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) { + const MIMGInfo *OrigInfo = getMIMGInfo(Opc); + const MIMGInfo *NewInfo = + getMIMGOpcodeHelper(OrigInfo->BaseOpcode, OrigInfo->MIMGEncoding, + NewChannels, OrigInfo->VAddrDwords); + return NewInfo ? NewInfo->Opcode : -1; } // Wrapper for Tablegen'd function. enum Subtarget is not defined in any @@ -183,10 +154,10 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) { return {7, 0, 3}; if (Features.test(FeatureISAVersion7_0_4)) return {7, 0, 4}; + if (Features.test(FeatureSeaIslands)) + return {7, 0, 0}; // GCN GFX8 (Volcanic Islands (VI)). - if (Features.test(FeatureISAVersion8_0_0)) - return {8, 0, 0}; if (Features.test(FeatureISAVersion8_0_1)) return {8, 0, 1}; if (Features.test(FeatureISAVersion8_0_2)) @@ -195,14 +166,22 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) { return {8, 0, 3}; if (Features.test(FeatureISAVersion8_1_0)) return {8, 1, 0}; + if (Features.test(FeatureVolcanicIslands)) + return {8, 0, 0}; // GCN GFX9. 
if (Features.test(FeatureISAVersion9_0_0)) return {9, 0, 0}; if (Features.test(FeatureISAVersion9_0_2)) return {9, 0, 2}; + if (Features.test(FeatureISAVersion9_0_4)) + return {9, 0, 4}; + if (Features.test(FeatureISAVersion9_0_6)) + return {9, 0, 6}; + if (Features.test(FeatureGFX9)) + return {9, 0, 0}; - if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands)) + if (Features.test(FeatureSouthernIslands)) return {0, 0, 0}; return {7, 0, 0}; } @@ -219,11 +198,15 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) { << ISAVersion.Major << ISAVersion.Minor << ISAVersion.Stepping; + + if (hasXNACK(*STI)) + Stream << "+xnack"; + Stream.flush(); } -bool hasCodeObjectV3(const FeatureBitset &Features) { - return Features.test(FeatureCodeObjectV3); +bool hasCodeObjectV3(const MCSubtargetInfo *STI) { + return STI->getFeatureBits().test(FeatureCodeObjectV3); } unsigned getWavefrontSize(const FeatureBitset &Features) { @@ -260,7 +243,7 @@ unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features, } unsigned getMaxWavesPerCU(const FeatureBitset &Features) { - return getMaxWavesPerEU(Features) * getEUsPerCU(Features); + return getMaxWavesPerEU() * getEUsPerCU(Features); } unsigned getMaxWavesPerCU(const FeatureBitset &Features, @@ -272,9 +255,7 @@ unsigned getMinWavesPerEU(const FeatureBitset &Features) { return 1; } -unsigned getMaxWavesPerEU(const FeatureBitset &Features) { - if (!Features.test(FeatureGCN)) - return 8; +unsigned getMaxWavesPerEU() { // FIXME: Need to take scratch memory into account. return 10; } @@ -330,11 +311,13 @@ unsigned getAddressableNumSGPRs(const FeatureBitset &Features) { unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { assert(WavesPerEU != 0); - if (WavesPerEU >= getMaxWavesPerEU(Features)) + if (WavesPerEU >= getMaxWavesPerEU()) return 0; - unsigned MinNumSGPRs = - alignDown(getTotalNumSGPRs(Features) / (WavesPerEU + 1), - getSGPRAllocGranule(Features)) + 1; + + unsigned MinNumSGPRs = getTotalNumSGPRs(Features) / (WavesPerEU + 1); + if (Features.test(FeatureTrapHandler)) + MinNumSGPRs -= std::min(MinNumSGPRs, (unsigned)TRAP_NUM_SGPRS); + MinNumSGPRs = alignDown(MinNumSGPRs, getSGPRAllocGranule(Features)) + 1; return std::min(MinNumSGPRs, getAddressableNumSGPRs(Features)); } @@ -343,14 +326,49 @@ unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU, assert(WavesPerEU != 0); IsaVersion Version = getIsaVersion(Features); - unsigned MaxNumSGPRs = alignDown(getTotalNumSGPRs(Features) / WavesPerEU, - getSGPRAllocGranule(Features)); unsigned AddressableNumSGPRs = getAddressableNumSGPRs(Features); if (Version.Major >= 8 && !Addressable) AddressableNumSGPRs = 112; + unsigned MaxNumSGPRs = getTotalNumSGPRs(Features) / WavesPerEU; + if (Features.test(FeatureTrapHandler)) + MaxNumSGPRs -= std::min(MaxNumSGPRs, (unsigned)TRAP_NUM_SGPRS); + MaxNumSGPRs = alignDown(MaxNumSGPRs, getSGPRAllocGranule(Features)); return std::min(MaxNumSGPRs, AddressableNumSGPRs); } +unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed, + bool FlatScrUsed, bool XNACKUsed) { + unsigned ExtraSGPRs = 0; + if (VCCUsed) + ExtraSGPRs = 2; + + IsaVersion Version = getIsaVersion(Features); + if (Version.Major < 8) { + if (FlatScrUsed) + ExtraSGPRs = 4; + } else { + if (XNACKUsed) + ExtraSGPRs = 4; + + if (FlatScrUsed) + ExtraSGPRs = 6; + } + + return ExtraSGPRs; +} + +unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed, + bool FlatScrUsed) { + return getNumExtraSGPRs(Features, VCCUsed, 
FlatScrUsed, + Features[AMDGPU::FeatureXNACK]); +} + +unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs) { + NumSGPRs = alignTo(std::max(1u, NumSGPRs), getSGPREncodingGranule(Features)); + // SGPRBlocks is actual number of SGPR blocks minus 1. + return NumSGPRs / getSGPREncodingGranule(Features) - 1; +} + unsigned getVGPRAllocGranule(const FeatureBitset &Features) { return 4; } @@ -370,7 +388,7 @@ unsigned getAddressableNumVGPRs(const FeatureBitset &Features) { unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { assert(WavesPerEU != 0); - if (WavesPerEU >= getMaxWavesPerEU(Features)) + if (WavesPerEU >= getMaxWavesPerEU()) return 0; unsigned MinNumVGPRs = alignDown(getTotalNumVGPRs(Features) / (WavesPerEU + 1), @@ -387,6 +405,12 @@ unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { return std::min(MaxNumVGPRs, AddressableNumVGPRs); } +unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumVGPRs) { + NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(Features)); + // VGPRBlocks is actual number of VGPR blocks minus 1. + return NumVGPRs / getVGPREncodingGranule(Features) - 1; +} + } // end namespace IsaInfo void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, @@ -396,7 +420,7 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, memset(&Header, 0, sizeof(Header)); Header.amd_kernel_code_version_major = 1; - Header.amd_kernel_code_version_minor = 1; + Header.amd_kernel_code_version_minor = 2; Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU Header.amd_machine_version_major = ISA.Major; Header.amd_machine_version_minor = ISA.Minor; @@ -416,6 +440,21 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, Header.private_segment_alignment = 4; } +amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() { + amdhsa::kernel_descriptor_t KD; + memset(&KD, 0, sizeof(KD)); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, + amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, 1); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1); + return KD; +} + bool isGroupSegment(const GlobalValue *GV) { return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; } @@ -425,7 +464,8 @@ bool isGlobalSegment(const GlobalValue *GV) { } bool isReadOnlySegment(const GlobalValue *GV) { - return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; + return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || + GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT; } bool shouldEmitConstantsToTextSection(const Triple &TT) { @@ -598,6 +638,18 @@ bool isEntryFunctionCC(CallingConv::ID CC) { } } +bool hasXNACK(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureXNACK]; +} + +bool hasMIMG_R128(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128]; +} + +bool hasPackedD16(const MCSubtargetInfo &STI) { + return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]; +} + bool isSI(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands]; } @@ -681,6 +733,8 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { case node: return isGFX9(STI) ? 
node##_gfx9 : node##_vi; unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { + if (STI.getTargetTriple().getArch() == Triple::r600) + return Reg; MAP_REG2REG } @@ -837,9 +891,6 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { assert(HasInv2Pi); - if (!EnablePackedInlinableLiterals) - return false; - int16_t Lo16 = static_cast<int16_t>(Literal); int16_t Hi16 = static_cast<int16_t>(Literal >> 16); return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); @@ -871,24 +922,6 @@ bool isArgPassedInSGPR(const Argument *A) { } } -// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence. -bool isUniformMMO(const MachineMemOperand *MMO) { - const Value *Ptr = MMO->getValue(); - // UndefValue means this is a load of a kernel input. These are uniform. - // Sometimes LDS instructions have constant pointers. - // If Ptr is null, then that means this mem operand contains a - // PseudoSourceValue like GOT. - if (!Ptr || isa<UndefValue>(Ptr) || - isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) - return true; - - if (const Argument *Arg = dyn_cast<Argument>(Ptr)) - return isArgPassedInSGPR(Arg); - - const Instruction *I = dyn_cast<Instruction>(Ptr); - return I && I->getMetadata("amdgpu.uniform"); -} - int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { if (isGCN3Encoding(ST)) return ByteOffset; @@ -909,18 +942,10 @@ namespace llvm { namespace AMDGPU { AMDGPUAS getAMDGPUAS(Triple T) { - auto Env = T.getEnvironmentName(); AMDGPUAS AS; - if (Env == "amdgiz" || Env == "amdgizcl") { - AS.FLAT_ADDRESS = 0; - AS.PRIVATE_ADDRESS = 5; - AS.REGION_ADDRESS = 4; - } - else { - AS.FLAT_ADDRESS = 4; - AS.PRIVATE_ADDRESS = 0; - AS.REGION_ADDRESS = 5; - } + AS.FLAT_ADDRESS = 0; + AS.PRIVATE_ADDRESS = 5; + AS.REGION_ADDRESS = 2; return AS; } @@ -931,5 +956,21 @@ AMDGPUAS getAMDGPUAS(const TargetMachine &M) { AMDGPUAS getAMDGPUAS(const Module &M) { return getAMDGPUAS(Triple(M.getTargetTriple())); } + +namespace { + +struct SourceOfDivergence { + unsigned Intr; +}; +const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr); + +#define GET_SourcesOfDivergence_IMPL +#include "AMDGPUGenSearchableTables.inc" + +} // end anonymous namespace + +bool isIntrinsicSourceOfDivergence(unsigned IntrID) { + return lookupSourceOfDivergence(IntrID); +} } // namespace AMDGPU } // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index a215b445378e..70681c271697 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -16,6 +16,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/CallingConv.h" #include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include <cstdint> @@ -28,24 +29,31 @@ class Argument; class FeatureBitset; class Function; class GlobalValue; -class MachineMemOperand; class MCContext; class MCRegisterClass; class MCRegisterInfo; class MCSection; class MCSubtargetInfo; +class MachineMemOperand; class Triple; namespace AMDGPU { + +#define GET_MIMGBaseOpcode_DECL +#define GET_MIMGDim_DECL +#define GET_MIMGEncoding_DECL +#include "AMDGPUGenSearchableTables.inc" + namespace IsaInfo { enum { // The closed Vulkan driver sets 96, which limits the wave count to 8 but // doesn't spill SGPRs as much as when 80 is set. 
- FIXED_NUM_SGPRS_FOR_INIT_BUG = 96 + FIXED_NUM_SGPRS_FOR_INIT_BUG = 96, + TRAP_NUM_SGPRS = 16 }; -/// \brief Instruction set architecture version. +/// Instruction set architecture version. struct IsaVersion { unsigned Major; unsigned Minor; @@ -55,12 +63,12 @@ struct IsaVersion { /// \returns Isa version for given subtarget \p Features. IsaVersion getIsaVersion(const FeatureBitset &Features); -/// \brief Streams isa version string for given subtarget \p STI into \p Stream. +/// Streams isa version string for given subtarget \p STI into \p Stream. void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream); -/// \returns True if given subtarget \p Features support code object version 3, +/// \returns True if given subtarget \p STI supports code object version 3, /// false otherwise. -bool hasCodeObjectV3(const FeatureBitset &Features); +bool hasCodeObjectV3(const MCSubtargetInfo *STI); /// \returns Wavefront size for given subtarget \p Features. unsigned getWavefrontSize(const FeatureBitset &Features); @@ -92,7 +100,7 @@ unsigned getMinWavesPerEU(const FeatureBitset &Features); /// \returns Maximum number of waves per execution unit for given subtarget \p /// Features without any kind of limitation. -unsigned getMaxWavesPerEU(const FeatureBitset &Features); +unsigned getMaxWavesPerEU(); /// \returns Maximum number of waves per execution unit for given subtarget \p /// Features and limited by given \p FlatWorkGroupSize. @@ -131,6 +139,22 @@ unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU); unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU, bool Addressable); +/// \returns Number of extra SGPRs implicitly required by given subtarget \p +/// Features when the given special registers are used. +unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed, + bool FlatScrUsed, bool XNACKUsed); + +/// \returns Number of extra SGPRs implicitly required by given subtarget \p +/// Features when the given special registers are used. XNACK is inferred from +/// \p Features. +unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed, + bool FlatScrUsed); + +/// \returns Number of SGPR blocks needed for given subtarget \p Features when +/// \p NumSGPRs are used. \p NumSGPRs should already include any special +/// register counts. +unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs); + /// \returns VGPR allocation granularity for given subtarget \p Features. unsigned getVGPRAllocGranule(const FeatureBitset &Features); @@ -151,20 +175,57 @@ unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU); /// execution unit requirement for given subtarget \p Features. unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU); +/// \returns Number of VGPR blocks needed for given subtarget \p Features when +/// \p NumVGPRs are used. 
+unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs); + } // end namespace IsaInfo LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx); +struct MIMGBaseOpcodeInfo { + MIMGBaseOpcode BaseOpcode; + bool Store; + bool Atomic; + bool AtomicX2; + bool Sampler; + + uint8_t NumExtraArgs; + bool Gradients; + bool Coordinates; + bool LodOrClampOrMip; + bool HasD16; +}; + +LLVM_READONLY +const MIMGBaseOpcodeInfo *getMIMGBaseOpcodeInfo(unsigned BaseOpcode); + +struct MIMGDimInfo { + MIMGDim Dim; + uint8_t NumCoords; + uint8_t NumGradients; + bool DA; +}; + LLVM_READONLY -int getMaskedMIMGOp(const MCInstrInfo &MII, - unsigned Opc, unsigned NewChannels); +const MIMGDimInfo *getMIMGDimInfo(unsigned Dim); + +LLVM_READONLY +int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, + unsigned VDataDwords, unsigned VAddrDwords); + +LLVM_READONLY +int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels); + LLVM_READONLY int getMCOpcode(uint16_t Opcode, unsigned Gen); void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const FeatureBitset &Features); +amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(); + bool isGroupSegment(const GlobalValue *GV); bool isGlobalSegment(const GlobalValue *GV); bool isReadOnlySegment(const GlobalValue *GV); @@ -216,7 +277,7 @@ unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); /// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version. unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); -/// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa +/// Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa /// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and /// \p Lgkmcnt respectively. /// @@ -240,7 +301,7 @@ unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, unsigned Lgkmcnt); -/// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa +/// Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa /// \p Version. /// /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows: @@ -278,41 +339,45 @@ inline bool isKernel(CallingConv::ID CC) { } } +bool hasXNACK(const MCSubtargetInfo &STI); +bool hasMIMG_R128(const MCSubtargetInfo &STI); +bool hasPackedD16(const MCSubtargetInfo &STI); + bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); bool isVI(const MCSubtargetInfo &STI); bool isGFX9(const MCSubtargetInfo &STI); -/// \brief Is Reg - scalar register +/// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); -/// \brief Is there any intersection between registers +/// Is there any intersection between registers bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI); /// If \p Reg is a pseudo reg, return the correct hardware register given /// \p STI otherwise return \p Reg. unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); -/// \brief Convert hardware register \p Reg to a pseudo register +/// Convert hardware register \p Reg to a pseudo register LLVM_READNONE unsigned mc2PseudoReg(unsigned Reg); -/// \brief Can this operand also contain immediate values? +/// Can this operand also contain immediate values? bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo); -/// \brief Is this floating-point operand? +/// Is this floating-point operand? 
bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo);

-/// \brief Does this opearnd support only inlinable literals?
+/// Does this opearnd support only inlinable literals?
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo);

-/// \brief Get the size in bits of a register from the register class \p RC.
+/// Get the size in bits of a register from the register class \p RC.
unsigned getRegBitWidth(unsigned RCID);

-/// \brief Get the size in bits of a register from the register class \p RC.
+/// Get the size in bits of a register from the register class \p RC.
unsigned getRegBitWidth(const MCRegisterClass &RC);

-/// \brief Get size of register operand
+/// Get size of register operand
unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
                           unsigned OpNo);
@@ -349,7 +414,7 @@ inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) {
  return getOperandSize(Desc.OpInfo[OpNo]);
}

-/// \brief Is this literal inlinable
+/// Is this literal inlinable
LLVM_READNONE
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
@@ -363,7 +428,6 @@ LLVM_READNONE
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);

bool isArgPassedInSGPR(const Argument *Arg);
-bool isUniformMMO(const MachineMemOperand *MMO);

/// \returns The encoding that will be used for \p ByteOffset in the SMRD
/// offset field.
@@ -374,6 +438,9 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
/// not the encoded offset.
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);

+/// \returns true if the intrinsic is divergent
+bool isIntrinsicSourceOfDivergence(unsigned IntrID);
+
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
new file mode 100644
index 000000000000..1924f71f11c8
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
@@ -0,0 +1,75 @@
+//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MBB A lane-dominates MBB B if
+// 1. A dominates B in the usual sense, i.e. every path from the entry to B
+//    goes through A, and
+// 2. whenever B executes, every active lane during that execution of B was
+//    also active during the most recent execution of A.
+//
+// The simplest example where A dominates B but does not lane-dominate it is
+// where A is a loop:
+//
+//     |
+//   +--+
+//   A  |
+//   +--+
+//     |
+//     B
+//
+// Unfortunately, the second condition is not fully captured by the control
+// flow graph when it is unstructured (as may happen when branch conditions are
+// uniform).
+//
+// The following replacement of the second condition is a conservative
+// approximation. It is an equivalent condition when the CFG is fully
+// structured:
+//
+// 2'. every cycle in the CFG that contains A also contains B.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULaneDominator.h"
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
+namespace llvm {
+
+namespace AMDGPU {
+
+// Given machine basic blocks A and B where A dominates B, check whether
+// A lane-dominates B.
+// +// The check is conservative, i.e. there can be false-negatives. +bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) { + // Check whether A is reachable from itself without going through B. + DenseSet<MachineBasicBlock *> Reachable; + SmallVector<MachineBasicBlock *, 8> Stack; + + Stack.push_back(A); + do { + MachineBasicBlock *MBB = Stack.back(); + Stack.pop_back(); + + for (MachineBasicBlock *Succ : MBB->successors()) { + if (Succ == A) + return false; + if (Succ != B && Reachable.insert(Succ).second) + Stack.push_back(Succ); + } + } while (!Stack.empty()); + + return true; +} + +} // namespace AMDGPU + +} // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h new file mode 100644 index 000000000000..4f33a89a364b --- /dev/null +++ b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h @@ -0,0 +1,24 @@ +//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H +#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H + +namespace llvm { + +class MachineBasicBlock; + +namespace AMDGPU { + +bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB); + +} // end namespace AMDGPU +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h index 991408c81c92..9f0a4d29b5e4 100644 --- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h @@ -73,7 +73,6 @@ FIELD2(amd_machine_version_stepping, machine_version_stepping, amd_machine_ve FIELD(kernel_code_entry_byte_offset), FIELD(kernel_code_prefetch_byte_size), -FIELD(max_scratch_backing_memory_byte_size), COMPPGM1(granulated_workitem_vgpr_count, compute_pgm_rsrc1_vgprs, VGPRS), COMPPGM1(granulated_wavefront_sgpr_count, compute_pgm_rsrc1_sgprs, SGPRS), diff --git a/lib/Target/AMDGPU/Utils/CMakeLists.txt b/lib/Target/AMDGPU/Utils/CMakeLists.txt index 01b80ebe8d3d..c5ed32e46821 100644 --- a/lib/Target/AMDGPU/Utils/CMakeLists.txt +++ b/lib/Target/AMDGPU/Utils/CMakeLists.txt @@ -2,4 +2,5 @@ add_llvm_library(LLVMAMDGPUUtils AMDGPUBaseInfo.cpp AMDKernelCodeTUtils.cpp AMDGPUAsmUtils.cpp + AMDGPULaneDominator.cpp ) diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index ff2bd2454400..4c7a92219755 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -40,17 +40,9 @@ class VOP1_SDWA9Ae <bits<8> op, VOPProfile P> : VOP_SDWA9Ae <P> { } class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> : - InstSI <P.Outs32, P.Ins32, "", pattern>, - VOP <opName>, - SIMCInstr <!if(VOP1Only, opName, opName#"_e32"), SIEncodingFamily.NONE>, - MnemonicAlias<!if(VOP1Only, opName, opName#"_e32"), opName> { + VOP_Pseudo <opName, !if(VOP1Only, "", "_e32"), P, P.Outs32, P.Ins32, "", pattern> { - let isPseudo = 1; - let isCodeGenOnly = 1; - let UseNamedOperandTable = 1; - - string Mnemonic = opName; - string AsmOperands = P.Asm32; + let AsmOperands = P.Asm32; let Size = 4; let mayLoad = 0; @@ -63,8 +55,6 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit 
VOP1On let Uses = [EXEC]; let AsmVariantName = AMDGPUAsmVariants.Default; - - VOPProfile Pfl = P; } class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> : @@ -86,6 +76,7 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> : let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; + let Defs = ps.Defs; } class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : @@ -202,13 +193,14 @@ defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>; defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>; defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>; defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>; -defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>; let SchedRW = [WriteQuarterRate32] in { +defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>; defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>; defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>; -defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32>; +defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>; defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>; +defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>; } // End SchedRW = [WriteQuarterRate32] let SchedRW = [WriteDouble] in { @@ -216,8 +208,6 @@ defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>; defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>; } // End SchedRW = [WriteDouble]; -defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>; - let SchedRW = [WriteDouble] in { defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>; } // End SchedRW = [WriteDouble] @@ -232,9 +222,9 @@ defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>; defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>; defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>; defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>; -defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>; let SchedRW = [WriteDoubleAdd] in { +defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>; defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>; defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>; } // End SchedRW = [WriteDoubleAdd] @@ -298,9 +288,7 @@ defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>; defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>; } // End Uses = [M0, EXEC] -let SchedRW = [WriteQuarterRate32] in { defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>; -} // These instruction only exist on SI and CI let SubtargetPredicate = isSICI in { @@ -344,11 +332,15 @@ defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>; defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>; defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>; defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>; +let SchedRW = [WriteQuarterRate32] in { defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>; defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>; defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>; defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>; defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>; +defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; +defm V_COS_F16 : VOP1Inst 
<"v_cos_f16", VOP_F16_F16, AMDGPUcos>; +} // End SchedRW = [WriteQuarterRate32] defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>; defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>; @@ -356,8 +348,6 @@ defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>; defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>; defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>; defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>; -defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; -defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; } @@ -392,6 +382,12 @@ let SubtargetPredicate = isGFX9 in { def V_SWAP_B32 : VOP1_Pseudo <"v_swap_b32", VOP_SWAP_I32, [], 1>; } +defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>; + +defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>; +defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>; +defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>; + } // End SubtargetPredicate = isGFX9 //===----------------------------------------------------------------------===// @@ -521,7 +517,7 @@ multiclass VOP1Only_Real_vi <bits<10> op> { } } -multiclass VOP1_Real_vi <bits<10> op> { +multiclass VOP1_Real_e32e64_vi <bits<10> op> { let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { def _e32_vi : VOP1_Real<!cast<VOP1_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>, @@ -530,6 +526,10 @@ multiclass VOP1_Real_vi <bits<10> op> { VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, VOP3e_vi <!add(0x140, op), !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; } +} + +multiclass VOP1_Real_vi <bits<10> op> { + defm NAME : VOP1_Real_e32e64_vi <op>; def _sdwa_vi : VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, @@ -593,9 +593,9 @@ defm V_FRACT_F64 : VOP1_Real_vi <0x32>; defm V_FREXP_EXP_I32_F32 : VOP1_Real_vi <0x33>; defm V_FREXP_MANT_F32 : VOP1_Real_vi <0x34>; defm V_CLREXCP : VOP1_Real_vi <0x35>; -defm V_MOVRELD_B32 : VOP1_Real_vi <0x36>; -defm V_MOVRELS_B32 : VOP1_Real_vi <0x37>; -defm V_MOVRELSD_B32 : VOP1_Real_vi <0x38>; +defm V_MOVRELD_B32 : VOP1_Real_e32e64_vi <0x36>; +defm V_MOVRELS_B32 : VOP1_Real_e32e64_vi <0x37>; +defm V_MOVRELSD_B32 : VOP1_Real_e32e64_vi <0x38>; defm V_TRUNC_F64 : VOP1_Real_vi <0x17>; defm V_CEIL_F64 : VOP1_Real_vi <0x18>; defm V_FLOOR_F64 : VOP1_Real_vi <0x1A>; @@ -622,6 +622,10 @@ defm V_SIN_F16 : VOP1_Real_vi <0x49>; defm V_COS_F16 : VOP1_Real_vi <0x4a>; defm V_SWAP_B32 : VOP1Only_Real_vi <0x51>; +defm V_SAT_PK_U8_I16 : VOP1_Real_vi<0x4f>; +defm V_CVT_NORM_I16_F16 : VOP1_Real_vi<0x4d>; +defm V_CVT_NORM_U16_F16 : VOP1_Real_vi<0x4e>; + // Copy of v_mov_b32 with $vdst as a use operand for use with VGPR // indexing mode. vdst can't be treated as a def for codegen purposes, // and an implicit use and def of the super register should be added. 
@@ -694,3 +698,23 @@ def : GCNPat < >; } // End OtherPredicates = [isVI] + +//===----------------------------------------------------------------------===// +// GFX9 +//===----------------------------------------------------------------------===// + +multiclass VOP1_Real_gfx9 <bits<10> op> { + let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in { + defm NAME : VOP1_Real_e32e64_vi <op>; + } + + def _sdwa_gfx9 : + VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, + VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; + + // For now left dpp only for asm/dasm + // TODO: add corresponding pseudo + def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>; +} + +defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index ef90b68db1a8..5ec1a15c5cd2 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -61,17 +61,9 @@ class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> { } class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> : - InstSI <P.Outs32, P.Ins32, "", pattern>, - VOP <opName>, - SIMCInstr <opName#suffix, SIEncodingFamily.NONE>, - MnemonicAlias<opName#suffix, opName> { + VOP_Pseudo <opName, suffix, P, P.Outs32, P.Ins32, "", pattern> { - let isPseudo = 1; - let isCodeGenOnly = 1; - let UseNamedOperandTable = 1; - - string Mnemonic = opName; - string AsmOperands = P.Asm32; + let AsmOperands = P.Asm32; let Size = 4; let mayLoad = 0; @@ -84,8 +76,6 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf let Uses = [EXEC]; let AsmVariantName = AMDGPUAsmVariants.Default; - - VOPProfile Pfl = P; } class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> : @@ -107,6 +97,7 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> : let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; + let Defs = ps.Defs; } class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : @@ -177,6 +168,10 @@ multiclass VOP2eInst <string opName, let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in { def _e32 : VOP2_Pseudo <opName, P>, Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; + + def _sdwa : VOP2_SDWA_Pseudo <opName, P> { + let AsmMatchConverter = "cvtSdwaVOP2b"; + } } def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, @@ -303,12 +298,30 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { let Src0RC32 = VCSrc_b32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above. let Asm32 = "$vdst, $src0, $src1, vcc"; let Asm64 = "$vdst, $src0, $src1, $src2"; + let AsmSDWA = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; + let AsmSDWA9 = "$vdst, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; + let AsmDPP = "$vdst, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; + let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst); // Suppress src2 implied by type since the 32-bit encoding uses an // implicit VCC use. 
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); + + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, + clampmod:$clamp, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel); + + let InsDPP = (ins DstRCDPP:$old, + Src0DPP:$src0, + Src1DPP:$src1, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let HasExt = 1; + let HasSDWA9 = 1; } def VOP_READLANE : VOPProfile<[i32, i32, i32]> { @@ -322,15 +335,17 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> { let HasSDWA9 = 0; } -def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { +def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> { let Outs32 = (outs VGPR_32:$vdst); let Outs64 = Outs32; - let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1); + let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in); let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; let HasExt = 0; let HasSDWA9 = 0; + let HasSrc2 = 0; + let HasSrc2Mods = 0; } //===----------------------------------------------------------------------===// @@ -398,7 +413,10 @@ let isConvergent = 1, Uses = []<Register> in { def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">; -def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">; +let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { +def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, + [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))], "">; +} // End $vdst = $vdst_in, DisableEncoding $vdst_in } // End isConvergent = 1 defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>; @@ -407,11 +425,11 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32 defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>; defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>; defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst" -defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>; -defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>; +defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_i16_f32>; +defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_u16_f32>; defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>; -defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>; -defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>; +defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_u16_u32>; +defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_i16_i32>; } // End SubtargetPredicate = isGCN @@ -473,6 +491,19 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; } // End SubtargetPredicate = Has16BitInsts +let SubtargetPredicate = HasDLInsts in { + +defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32>; + +let Constraints = "$vdst = $src2", + DisableEncoding="$src2", + isConvertibleToThreeAddress = 1, + isCommutable = 1 in { +defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>; +} + 
+} // End SubtargetPredicate = HasDLInsts + // Note: 16-bit instructions produce a 0 result in the high 16-bits. multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> { @@ -639,7 +670,7 @@ defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>; defm V_READLANE_B32 : VOP2_Real_si <0x01>; -let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1) in { +let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in { defm V_WRITELANE_B32 : VOP2_Real_si <0x02>; } @@ -824,7 +855,7 @@ multiclass VOP2_Real_e32e64_vi <bits<6> op> : def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>; } -defm V_CNDMASK_B32 : Base_VOP2_Real_e32e64_vi <0x0>; +defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>; defm V_ADD_F32 : VOP2_Real_e32e64_vi <0x1>; defm V_SUB_F32 : VOP2_Real_e32e64_vi <0x2>; defm V_SUBREV_F32 : VOP2_Real_e32e64_vi <0x3>; @@ -926,3 +957,10 @@ def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>; def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>; } // End SubtargetPredicate = isVI + +let SubtargetPredicate = HasDLInsts in { + +defm V_FMAC_F32 : VOP2_Real_e32e64_vi <0x3b>; +defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>; + +} // End SubtargetPredicate = HasDLInsts diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index aedbfa015bf6..17ae08dc6267 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -153,19 +153,24 @@ class getVOP3VCC<VOPProfile P, SDPatternOperator node> { (i1 VCC)))]; } -class VOP3Features<bit Clamp, bit OpSel> { +class VOP3Features<bit Clamp, bit OpSel, bit Packed> { bit HasClamp = Clamp; bit HasOpSel = OpSel; + bit IsPacked = Packed; } -def VOP3_REGULAR : VOP3Features<0, 0>; -def VOP3_CLAMP : VOP3Features<1, 0>; -def VOP3_OPSEL : VOP3Features<1, 1>; +def VOP3_REGULAR : VOP3Features<0, 0, 0>; +def VOP3_CLAMP : VOP3Features<1, 0, 0>; +def VOP3_OPSEL : VOP3Features<1, 1, 0>; +def VOP3_PACKED : VOP3Features<1, 1, 1>; class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> { let HasClamp = !if(Features.HasClamp, 1, P.HasClamp); let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel); + let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); + + let HasModifiers = !if(Features.IsPacked, 1, P.HasModifiers); // FIXME: Hack to stop printing _e64 let Outs64 = (outs DstRC.RegClass:$vdst); @@ -283,10 +288,10 @@ def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>; def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>; -def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>; def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>; let SchedRW = [WriteDoubleAdd] in { +def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>; def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>; def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>; def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>; @@ -355,14 +360,12 @@ def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPU def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> { let SchedRW = [WriteFloatFMA, WriteSALU]; 
- let hasExtraSrcRegAllocReq = 1; let AsmMatchConverter = ""; } // Double precision division pre-scale. def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> { let SchedRW = [WriteDouble, WriteSALU]; - let hasExtraSrcRegAllocReq = 1; let AsmMatchConverter = ""; } @@ -376,6 +379,7 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3 let SchedRW = [WriteDouble]; } +let SchedRW = [Write64Bit] in { // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>>; @@ -389,17 +393,17 @@ def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>>; def V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>>; def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>; } // End SubtargetPredicate = isVI - +} // End SchedRW = [Write64Bit] let SubtargetPredicate = isCIVI in { -let Constraints = "@earlyclobber $vdst" in { +let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>; def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP>>; -} // End Constraints = "@earlyclobber $vdst" +} // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] let isCommutable = 1 in { -let SchedRW = [WriteDouble, WriteSALU] in { +let SchedRW = [WriteQuarterRate32, WriteSALU] in { def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } // End SchedRW = [WriteDouble, WriteSALU] @@ -408,16 +412,16 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } // End SubtargetPredicate = isCIVI -let SubtargetPredicate = Has16BitInsts in { - -let renamedInGFX9 = 1 in { -def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>; +def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup> { + let Predicates = [Has16BitInsts, isVIOnly]; } -let SubtargetPredicate = isGFX9 in { -def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>; +def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", + VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup> { + let renamedInGFX9 = 1; + let Predicates = [Has16BitInsts, isGFX9]; } -let isCommutable = 1 in { +let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in { let renamedInGFX9 = 1 in { def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>; @@ -438,15 +442,14 @@ def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f1 def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>; def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>; -} // End isCommutable = 1 -} // End SubtargetPredicate = Has16BitInsts +} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1 let SubtargetPredicate = isVI in { def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>; def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>; def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>; -def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_PERM_B32 : VOP3Inst 
<"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUperm>; } // End SubtargetPredicate = isVI let Predicates = [Has16BitInsts] in { @@ -697,7 +700,7 @@ multiclass VOP3Interp_F16_Real_vi<bits<10> op> { let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in { multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> { - def _vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>, + def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>, VOP3e_vi <op, !cast<VOP3_Pseudo>(OpName).Pfl> { VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName); let AsmString = AsmName # ps.AsmOperands; @@ -705,7 +708,7 @@ multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> { } multiclass VOP3OpSel_F16_Real_gfx9<bits<10> op, string AsmName> { - def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>, + def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>, VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(NAME).Pfl> { VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME); let AsmString = AsmName # ps.AsmOperands; @@ -713,7 +716,7 @@ multiclass VOP3OpSel_F16_Real_gfx9<bits<10> op, string AsmName> { } multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> { - def _vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>, + def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>, VOP3Interp_vi <op, !cast<VOP3_Pseudo>(OpName).Pfl> { VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName); let AsmString = AsmName # ps.AsmOperands; @@ -721,9 +724,9 @@ multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> } multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> { - def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>, - VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl> { - VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME); + def _gfx9 : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.GFX9>, + VOP3e_vi <op, !cast<VOP_Pseudo>(NAME).Pfl> { + VOP_Pseudo ps = !cast<VOP_Pseudo>(NAME); let AsmString = AsmName # ps.AsmOperands; } } diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td index eeee8b36c175..5c78ada3211e 100644 --- a/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/lib/Target/AMDGPU/VOP3PInstructions.td @@ -68,6 +68,67 @@ def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I1 def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>; def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>; +multiclass MadFmaMixPats<SDPatternOperator fma_like, + Instruction mix_inst, + Instruction mixlo_inst, + Instruction mixhi_inst> { + def : GCNPat < + (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), + (mixlo_inst $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + DSTCLAMP.NONE, + (i32 (IMPLICIT_DEF))) + >; + + // FIXME: Special case handling for maxhi (especially for clamp) + // because dealing with the write to high half of the register is + // difficult. 
+  def : GCNPat <
+    (build_vector f16:$elt0, (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+                                                (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+                                                (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+    (v2f16 (mixhi_inst $src0_modifiers, $src0,
+                       $src1_modifiers, $src1,
+                       $src2_modifiers, $src2,
+                       DSTCLAMP.NONE,
+                       $elt0))
+  >;
+
+  def : GCNPat <
+    (build_vector
+      f16:$elt0,
+      (AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+                                      (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+                                      (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
+    (v2f16 (mixhi_inst $src0_modifiers, $src0,
+                       $src1_modifiers, $src1,
+                       $src2_modifiers, $src2,
+                       DSTCLAMP.ENABLE,
+                       $elt0))
+  >;
+
+  def : GCNPat <
+    (AMDGPUclamp (build_vector
+      (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
+                         (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
+                         (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
+      (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
+                         (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
+                         (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
+    (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
+                       $hi_src1_modifiers, $hi_src1,
+                       $hi_src2_modifiers, $hi_src2,
+                       DSTCLAMP.ENABLE,
+                       (mixlo_inst $lo_src0_modifiers, $lo_src0,
+                                   $lo_src1_modifiers, $lo_src1,
+                                   $lo_src2_modifiers, $lo_src2,
+                                   DSTCLAMP.ENABLE,
+                                   (i32 (IMPLICIT_DEF)))))
+  >;
+}
 
 let SubtargetPredicate = HasMadMixInsts in {
 // These are VOP3a-like opcodes which accept no omod.
@@ -84,68 +145,41 @@ def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16
 }
 }
 
-def : GCNPat <
-  (f16 (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                      (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                      (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
-  (V_MAD_MIXLO_F16 $src0_modifiers, $src0,
-                   $src1_modifiers, $src1,
-                   $src2_modifiers, $src2,
-                   DSTCLAMP.NONE,
-                   (i32 (IMPLICIT_DEF)))
->;
+defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
+} // End SubtargetPredicate = HasMadMixInsts
 
-// FIXME: Special case handling for maxhi (especially for clamp)
-// because dealing with the write to high half of the register is
-// difficult.
-def : GCNPat <
-  (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                          (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                          (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
-  (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
-                          $src1_modifiers, $src1,
-                          $src2_modifiers, $src2,
-                          DSTCLAMP.NONE,
-                          $elt0))
->;
 
-def : GCNPat <
-  (build_vector
-    f16:$elt0,
-    (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
-                                (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
-                                (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
-  (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
-                          $src1_modifiers, $src1,
-                          $src2_modifiers, $src2,
-                          DSTCLAMP.ENABLE,
-                          $elt0))
->;
+// Essentially the same as the mad_mix versions
+let SubtargetPredicate = HasFmaMixInsts in {
+let isCommutable = 1 in {
+def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
 
-def : GCNPat <
-  (AMDGPUclamp (build_vector
-    (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
-                   (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
-                   (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
-    (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
-                   (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
-                   (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
-  (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0,
-                          $hi_src1_modifiers, $hi_src1,
-                          $hi_src2_modifiers, $hi_src2,
-                          DSTCLAMP.ENABLE,
-                          (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0,
-                                           $lo_src1_modifiers, $lo_src1,
-                                           $lo_src2_modifiers, $lo_src2,
-                                           DSTCLAMP.ENABLE,
-                                           (i32 (IMPLICIT_DEF)))))
->;
+// Clamp modifier is applied after conversion to f16.
+def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+
+let ClampLo = 0, ClampHi = 1 in {
+def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+}
+}
+
+defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
+}
 
-} // End SubtargetPredicate = [HasMadMixInsts]
+let SubtargetPredicate = HasDLInsts in {
+
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>, AMDGPUfdot2>;
+def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2>;
+def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2>;
+def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4>;
+def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4>;
+def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8>;
+def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8>;
+
+} // End SubtargetPredicate = HasDLInsts
 
 multiclass VOP3P_Real_vi<bits<10> op> {
-  def _vi : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.VI>,
-            VOP3Pe <op, !cast<VOP3P_Pseudo>(NAME).Pfl> {
+  def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+            VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
     let AssemblerPredicates = [HasVOP3PInsts];
     let DecoderNamespace = "VI";
   }
@@ -172,6 +206,33 @@ defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>;
 defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>;
 defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>;
 
+
+let SubtargetPredicate = HasMadMixInsts in {
 defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>;
 defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
 defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+}
+
+let SubtargetPredicate = HasFmaMixInsts in {
+let DecoderNamespace = "GFX9_DL" in {
+// The mad_mix instructions were renamed and their behaviors changed,
+// but the opcode stayed the same so we need to put these in a
+// different DecoderNamespace to avoid the ambiguity.
+defm V_FMA_MIX_F32 : VOP3P_Real_vi <0x3a0>;
+defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
+defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
+}
+}
+
+
+let SubtargetPredicate = HasDLInsts in {
+
+defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>;
+defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>;
+defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>;
+defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>;
+defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>;
+defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;
+defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>;
+
+} // End SubtargetPredicate = HasDLInsts
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index 146870e21531..cc6b8116afee 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -30,8 +30,8 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
   let Inst{31-25} = 0x3e; // encoding
 
   // VOPC disallows dst_sel and dst_unused as they have no effect on destination
-  let Inst{42-40} = SDWA.DWORD;
-  let Inst{44-43} = SDWA.UNUSED_PRESERVE;
+  let Inst{42-40} = 0;
+  let Inst{44-43} = 0;
 }
 
 class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
@@ -106,6 +106,7 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> :
   let TSFlags = ps.TSFlags;
   let UseNamedOperandTable = ps.UseNamedOperandTable;
   let Uses = ps.Uses;
+  let Defs = ps.Defs;
 }
 
 class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index f24ff5ce8dea..f0f7f259f71d 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -38,6 +38,23 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
   let Uses = [EXEC];
 }
 
+class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
+                  string asm, list<dag> pattern> :
+  InstSI <outs, ins, asm, pattern>,
+  VOP <opName>,
+  SIMCInstr <opName#suffix, SIEncodingFamily.NONE>,
+  MnemonicAlias<opName#suffix, opName> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let UseNamedOperandTable = 1;
+
+  string Mnemonic = opName;
+  VOPProfile Pfl = P;
+
+  string AsmOperands;
+}
+
 class VOP3Common <dag outs, dag ins, string asm = "",
                   list<dag> pattern = [], bit HasMods = 0,
                   bit VOP3Only = 0> :
@@ -66,26 +83,18 @@ class VOP3Common <dag outs, dag ins, string asm = "",
 
 class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
                    bit VOP3Only = 0, bit isVOP3P = 0, bit isVop3OpSel = 0> :
-  InstSI <P.Outs64,
-          !if(isVop3OpSel,
-              P.InsVOP3OpSel,
-              !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64)),
-          "",
-          pattern>,
-  VOP <opName>,
-  SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
-  MnemonicAlias<opName#"_e64", opName> {
+  VOP_Pseudo <opName, "_e64", P, P.Outs64,
+              !if(isVop3OpSel,
+                  P.InsVOP3OpSel,
+                  !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64)),
+              "", pattern> {
 
-  let isPseudo = 1;
-  let isCodeGenOnly = 1;
-  let UseNamedOperandTable = 1;
   let VOP3_OPSEL = isVop3OpSel;
   let IsPacked = P.IsPacked;
 
-  string Mnemonic = opName;
-  string AsmOperands = !if(isVop3OpSel,
-                           P.AsmVOP3OpSel,
-                           !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
+  let AsmOperands = !if(isVop3OpSel,
+                        P.AsmVOP3OpSel,
+                        !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
 
   let Size = 8;
   let mayLoad = 0;
@@ -120,8 +129,6 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
                     !if(!or(P.HasModifiers, !or(P.HasOMod, P.HasIntClamp)),
                         "cvtVOP3",
                         ""));
-
-  VOPProfile Pfl = P;
 }
 
 class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
@@ -129,7 +136,7 @@ class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
   let VOP3P = 1;
 }
 
-class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
+class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
   InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
   SIMCInstr <ps.PseudoInstr, EncodingFamily> {
 
@@ -149,13 +156,14 @@ class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
   let TSFlags = ps.TSFlags;
   let UseNamedOperandTable = ps.UseNamedOperandTable;
   let Uses = ps.Uses;
+  let Defs = ps.Defs;
 
   VOPProfile Pfl = ps.Pfl;
 }
 
 // XXX - Is there any reason to distingusih this from regular VOP3
 // here?
-class VOP3P_Real<VOP3P_Pseudo ps, int EncodingFamily> :
+class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily> :
   VOP3_Real<ps, EncodingFamily>;
 
 class VOP3a<VOPProfile P> : Enc64 {
@@ -324,13 +332,13 @@ class VOP_SDWAe<VOPProfile P> : Enc64 {
   bits<1> clamp;
 
   let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
-  let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
-  let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+  let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0);
+  let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0);
   let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
-  let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+  let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0);
   let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
   let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
-  let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+  let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, 0);
   let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
   let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
 }
@@ -358,11 +366,11 @@ class VOP_SDWA9e<VOPProfile P> : Enc64 {
   bits<1> src1_sgpr;
 
   let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
-  let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+  let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, 0);
   let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
   let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
   let Inst{55} = !if(P.HasSrc0, src0{8}, 0);
-  let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+  let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, 0);
   let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
   let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
   let Inst{63} = 0; // src1_sgpr - should be specified in subclass
@@ -375,8 +383,8 @@ class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> {
   bits<1> clamp;
   bits<2> omod;
 
-  let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
-  let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+  let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, 0);
+  let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, 0);
   let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
   let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0);
 }