diff options
| author | Dimitry Andric <dim@FreeBSD.org> | 2019-01-19 10:01:25 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2019-01-19 10:01:25 +0000 |
| commit | d8e91e46262bc44006913e6796843909f1ac7bcd (patch) | |
| tree | 7d0c143d9b38190e0fa0180805389da22cd834c5 /lib/Target/AMDGPU | |
| parent | b7eb8e35e481a74962664b63dfb09483b200209a (diff) | |
Notes
Diffstat (limited to 'lib/Target/AMDGPU')
126 files changed, 12013 insertions, 4897 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 796766d946221..bb7801c172f60 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -37,10 +37,13 @@ FunctionPass *createAMDGPUCFGStructurizerPass(); FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel); // SI Passes +FunctionPass *createGCNDPPCombinePass(); FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSIFoldOperandsPass(); FunctionPass *createSIPeepholeSDWAPass(); FunctionPass *createSILowerI1CopiesPass(); +FunctionPass *createSIFixupVectorISelPass(); +FunctionPass *createSIAddIMGInitPass(); FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILoadStoreOptimizerPass(); FunctionPass *createSIWholeQuadModePass(); @@ -57,6 +60,7 @@ FunctionPass *createAMDGPUUseNativeCallsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(); FunctionPass *createAMDGPUMachineCFGStructurizerPass(); FunctionPass *createAMDGPURewriteOutArgumentsPass(); +FunctionPass *createSIModeRegisterPass(); void initializeAMDGPUDAGToDAGISelPass(PassRegistry&); @@ -69,10 +73,18 @@ Pass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; +FunctionPass *createAMDGPUAtomicOptimizerPass(); +void initializeAMDGPUAtomicOptimizerPass(PassRegistry &); +extern char &AMDGPUAtomicOptimizerID; + ModulePass *createAMDGPULowerIntrinsicsPass(); void initializeAMDGPULowerIntrinsicsPass(PassRegistry &); extern char &AMDGPULowerIntrinsicsID; +ModulePass *createAMDGPUFixFunctionBitcastsPass(); +void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &); +extern char &AMDGPUFixFunctionBitcastsID; + FunctionPass *createAMDGPULowerKernelArgumentsPass(); void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &); extern char &AMDGPULowerKernelArgumentsID; @@ -84,6 +96,9 @@ extern char &AMDGPULowerKernelAttributesID; void 
initializeAMDGPURewriteOutArgumentsPass(PassRegistry &); extern char &AMDGPURewriteOutArgumentsID; +void initializeGCNDPPCombinePass(PassRegistry &); +extern char &GCNDPPCombineID; + void initializeR600ClauseMergePassPass(PassRegistry &); extern char &R600ClauseMergePassID; @@ -114,6 +129,9 @@ extern char &SIFixSGPRCopiesID; void initializeSIFixVGPRCopiesPass(PassRegistry &); extern char &SIFixVGPRCopiesID; +void initializeSIFixupVectorISelPass(PassRegistry &); +extern char &SIFixupVectorISelID; + void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; @@ -141,6 +159,9 @@ extern char &AMDGPUSimplifyLibCallsID; void initializeAMDGPUUseNativeCallsPass(PassRegistry &); extern char &AMDGPUUseNativeCallsID; +void initializeSIAddIMGInitPass(PassRegistry &); +extern char &SIAddIMGInitID; + void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &); extern char &AMDGPUPerfHintAnalysisID; @@ -179,6 +200,9 @@ extern char &SIMemoryLegalizerID; void initializeSIDebuggerInsertNopsPass(PassRegistry&); extern char &SIDebuggerInsertNopsID; +void initializeSIModeRegisterPass(PassRegistry&); +extern char &SIModeRegisterID; + void initializeSIInsertWaitcntsPass(PassRegistry&); extern char &SIInsertWaitcntsID; @@ -190,6 +214,8 @@ extern char &AMDGPUUnifyDivergentExitNodesID; ImmutablePass *createAMDGPUAAWrapperPass(); void initializeAMDGPUAAWrapperPassPass(PassRegistry&); +ImmutablePass *createAMDGPUExternalAAWrapperPass(); +void initializeAMDGPUExternalAAWrapperPass(PassRegistry&); void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &); @@ -221,19 +247,18 @@ enum TargetIndex { /// however on the GPU, each address space points to /// a separate piece of memory that is unique from other /// memory locations. -struct AMDGPUAS { - // The following address space values depend on the triple environment. - unsigned PRIVATE_ADDRESS; ///< Address space for private memory. - unsigned FLAT_ADDRESS; ///< Address space for flat memory. 
- unsigned REGION_ADDRESS; ///< Address space for region memory. - +namespace AMDGPUAS { enum : unsigned { // The maximum value for flat, generic, local, private, constant and region. - MAX_COMMON_ADDRESS = 5, + MAX_AMDGPU_ADDRESS = 6, + FLAT_ADDRESS = 0, ///< Address space for flat memory. GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). + REGION_ADDRESS = 2, ///< Address space for region memory. + CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2) LOCAL_ADDRESS = 3, ///< Address space for local memory. + PRIVATE_ADDRESS = 5, ///< Address space for private memory. CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory @@ -268,14 +293,6 @@ struct AMDGPUAS { // Some places use this if the address space can't be determined. UNKNOWN_ADDRESS_SPACE = ~0u, }; -}; - -namespace llvm { -namespace AMDGPU { -AMDGPUAS getAMDGPUAS(const Module &M); -AMDGPUAS getAMDGPUAS(const TargetMachine &TM); -AMDGPUAS getAMDGPUAS(Triple T); -} // namespace AMDGPU -} // namespace llvm +} #endif diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 16c2a366db285..6a4cfe08e4910 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -11,6 +11,10 @@ include "llvm/TableGen/SearchableTable.td" include "llvm/Target/Target.td" include "AMDGPUFeatures.td" +class BoolToList<bit Value> { + list<int> ret = !if(Value, [1]<int>, []<int>); +} + //===------------------------------------------------------------===// // Subtarget Features (device properties) //===------------------------------------------------------------===// @@ -140,6 +144,12 @@ def FeatureCIInsts : SubtargetFeature<"ci-insts", "Additional instructions for CI+" >; +def FeatureVIInsts : SubtargetFeature<"vi-insts", + "VIInsts", + "true", + "Additional instructions for VI+" +>; + def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts", "GFX9Insts", "true", @@ -236,6 +246,12 @@ def FeatureDPP : SubtargetFeature<"dpp", "Support DPP (Data 
Parallel Primitives) extension" >; +def FeatureR128A16 : SubtargetFeature<"r128-a16", + "HasR128A16", + "true", + "Support 16 bit coordindates/gradients/lod/clamp/mip types on gfx9" +>; + def FeatureIntClamp : SubtargetFeature<"int-clamp-insts", "HasIntClamp", "true", @@ -251,31 +267,25 @@ def FeatureUnpackedD16VMem : SubtargetFeature<"unpacked-d16-vmem", def FeatureDLInsts : SubtargetFeature<"dl-insts", "HasDLInsts", "true", - "Has deep learning instructions" + "Has v_fmac_f32 and v_xnor_b32 instructions" >; -def FeatureD16PreservesUnusedBits : SubtargetFeature< - "d16-preserves-unused-bits", - "D16PreservesUnusedBits", +def FeatureDotInsts : SubtargetFeature<"dot-insts", + "HasDotInsts", "true", - "If present, then instructions defined by HasD16LoadStore predicate preserve " - "unused bits. Otherwise instructions defined by HasD16LoadStore predicate " - "zero unused bits." + "Has v_dot* instructions" +>; + +def FeatureSRAMECC : SubtargetFeature<"sram-ecc", + "EnableSRAMECC", + "true", + "Enable SRAM ECC" >; //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// -// Some instructions do not support denormals despite this flag. Using -// fp32 denormals also causes instructions to run at the double -// precision rate for the device. -def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", - "FP32Denormals", - "true", - "Enable single precision denormal handling" ->; - // Denormal handling for fp64 and fp16 is controlled by the same // config register when fp16 supported. // TODO: Do we need a separate f16 setting when not legal? 
@@ -324,12 +334,6 @@ def FeatureEnableHugePrivateBuffer : SubtargetFeature< "Enable private/scratch buffer sizes greater than 128 GB" >; -def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", - "EnableVGPRSpilling", - "true", - "Enable spilling of VGPRs to scratch memory" ->; - def FeatureDumpCode : SubtargetFeature <"DumpCode", "DumpCode", "true", @@ -373,6 +377,16 @@ def FeatureEnableDS128 : SubtargetFeature<"enable-ds128", "Use ds_{read|write}_b128" >; +// Sparse texture support requires that all result registers are zeroed when +// PRTStrictNull is set to true. This feature is turned on for all architectures +// but is enabled as a feature in case there are situations where PRTStrictNull +// is disabled by the driver. +def FeatureEnablePRTStrictNull : SubtargetFeature<"enable-prt-strict-null", + "EnablePRTStrictNull", + "true", + "Enable zeroing of result registers for sparse texture fetches" +>; + // Unless +-flat-for-global is specified, turn on FlatForGlobal for // all OS-es on VI and newer hardware to avoid assertion failures due // to missing ADDR64 variants of MUBUF instructions. @@ -399,6 +413,12 @@ def FeatureCodeObjectV3 : SubtargetFeature < "Generate code object version 3" >; +def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range", + "HasTrigReducedRange", + "true", + "Requires use of fract on arguments to trig instructions" +>; + // Dummy feature used to disable assembler instructions. 
def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", @@ -418,36 +438,36 @@ class GCNSubtargetFeatureGeneration <string Value, def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureGCN, - FeatureLDSBankCount32, FeatureMovrel] + FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange] >; def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, - FeatureCIInsts, FeatureMovrel] + FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange] >; def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, - FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, + FeatureGCN3Encoding, FeatureCIInsts, FeatureVIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP, - FeatureIntClamp + FeatureIntClamp, FeatureTrigReducedRange ] >; def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", [FeatureFP64, FeatureLocalMemorySize65536, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, - FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, + FeatureGCN3Encoding, FeatureCIInsts, FeatureVIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, - FeatureAddNoCarryInsts, FeatureScalarAtomics + FeatureAddNoCarryInsts, FeatureScalarAtomics, 
FeatureR128A16 ] >; @@ -465,34 +485,41 @@ def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0, [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops, - FeatureLDSBankCount32]>; + FeatureLDSBankCount32, + FeatureCodeObjectV3]>; def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1, [FeatureSouthernIslands, - FeatureLDSBankCount32]>; + FeatureLDSBankCount32, + FeatureCodeObjectV3]>; def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0, [FeatureSeaIslands, - FeatureLDSBankCount32]>; + FeatureLDSBankCount32, + FeatureCodeObjectV3]>; def FeatureISAVersion7_0_1 : SubtargetFeatureISAVersion <7,0,1, [FeatureSeaIslands, HalfRate64Ops, FeatureLDSBankCount32, - FeatureFastFMAF32]>; + FeatureFastFMAF32, + FeatureCodeObjectV3]>; def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2, [FeatureSeaIslands, FeatureLDSBankCount16, - FeatureFastFMAF32]>; + FeatureFastFMAF32, + FeatureCodeObjectV3]>; def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3, [FeatureSeaIslands, - FeatureLDSBankCount16]>; + FeatureLDSBankCount16, + FeatureCodeObjectV3]>; def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4, [FeatureSeaIslands, - FeatureLDSBankCount32]>; + FeatureLDSBankCount32, + FeatureCodeObjectV3]>; def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, [FeatureVolcanicIslands, @@ -500,49 +527,63 @@ def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, HalfRate64Ops, FeatureLDSBankCount32, FeatureXNACK, - FeatureUnpackedD16VMem]>; + FeatureUnpackedD16VMem, + FeatureCodeObjectV3]>; def FeatureISAVersion8_0_2 : SubtargetFeatureISAVersion <8,0,2, [FeatureVolcanicIslands, FeatureLDSBankCount32, FeatureSGPRInitBug, - FeatureUnpackedD16VMem]>; + FeatureUnpackedD16VMem, + FeatureCodeObjectV3]>; def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3, [FeatureVolcanicIslands, FeatureLDSBankCount32, - FeatureUnpackedD16VMem]>; + FeatureUnpackedD16VMem, + FeatureCodeObjectV3]>; def FeatureISAVersion8_1_0 : 
SubtargetFeatureISAVersion <8,1,0, [FeatureVolcanicIslands, FeatureLDSBankCount16, - FeatureXNACK]>; + FeatureXNACK, + FeatureCodeObjectV3]>; def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0, [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, - FeatureD16PreservesUnusedBits]>; + FeatureCodeObjectV3]>; def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2, [FeatureGFX9, FeatureMadMixInsts, FeatureLDSBankCount32, FeatureXNACK, - FeatureD16PreservesUnusedBits]>; + FeatureCodeObjectV3]>; def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4, [FeatureGFX9, FeatureLDSBankCount32, FeatureFmaMixInsts, - FeatureD16PreservesUnusedBits]>; + FeatureCodeObjectV3]>; def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6, [FeatureGFX9, HalfRate64Ops, FeatureFmaMixInsts, FeatureLDSBankCount32, - FeatureDLInsts]>; + FeatureDLInsts, + FeatureDotInsts, + FeatureSRAMECC, + FeatureCodeObjectV3]>; + +def FeatureISAVersion9_0_9 : SubtargetFeatureISAVersion <9,0,9, + [FeatureGFX9, + FeatureMadMixInsts, + FeatureLDSBankCount32, + FeatureXNACK, + FeatureCodeObjectV3]>; //===----------------------------------------------------------------------===// // Debugger related subtarget features. 
@@ -674,8 +715,9 @@ def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">, def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">, AssemblerPredicate<"!FeatureUnpackedD16VMem">; -def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">, - AssemblerPredicate<"FeatureD16PreservesUnusedBits">; +def D16PreservesUnusedBits : + Predicate<"Subtarget->hasD16LoadStore() && !Subtarget->isSRAMECCEnabled()">, + AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">; def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">; def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">; @@ -683,10 +725,10 @@ def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">; def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, AssemblerPredicate<"FeatureGFX9Insts">; -def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarryInsts()">, +def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">, AssemblerPredicate<"FeatureAddNoCarryInsts">; -def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarryInsts()">, +def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">, AssemblerPredicate<"!FeatureAddNoCarryInsts">; def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, @@ -706,6 +748,9 @@ def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">, def HasDPP : Predicate<"Subtarget->hasDPP()">, AssemblerPredicate<"FeatureDPP">; +def HasR128A16 : Predicate<"Subtarget->hasR128A16()">, + AssemblerPredicate<"FeatureR128A16">; + def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">, AssemblerPredicate<"FeatureIntClamp">; @@ -728,6 +773,9 @@ def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">, def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">, AssemblerPredicate<"FeatureDLInsts">; +def HasDotInsts : Predicate<"Subtarget->hasDotInsts()">, + AssemblerPredicate<"FeatureDotInsts">; + def EnableLateCFGStructurize : Predicate< 
"EnableLateStructurizeCFG">; @@ -736,7 +784,6 @@ def EnableLateCFGStructurize : Predicate< include "SISchedule.td" include "GCNProcessors.td" include "AMDGPUInstrInfo.td" -include "AMDGPUIntrinsics.td" include "SIIntrinsics.td" include "AMDGPURegisterInfo.td" include "AMDGPURegisterBanks.td" diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp index ef4b69d09d9f5..73709ba13643e 100644 --- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp +++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -34,69 +34,45 @@ using namespace llvm; // Register this pass... char AMDGPUAAWrapperPass::ID = 0; +char AMDGPUExternalAAWrapper::ID = 0; INITIALIZE_PASS(AMDGPUAAWrapperPass, "amdgpu-aa", "AMDGPU Address space based Alias Analysis", false, true) +INITIALIZE_PASS(AMDGPUExternalAAWrapper, "amdgpu-aa-wrapper", + "AMDGPU Address space based Alias Analysis Wrapper", false, true) + ImmutablePass *llvm::createAMDGPUAAWrapperPass() { return new AMDGPUAAWrapperPass(); } +ImmutablePass *llvm::createAMDGPUExternalAAWrapperPass() { + return new AMDGPUExternalAAWrapper(); +} + void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); } -// Must match the table in getAliasResult. -AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_) - : Arch(Arch_), AS(AS_) { - // These arrarys are indexed by address space value - // enum elements 0 ... 
to 5 - static const AliasResult ASAliasRulesPrivIsZero[6][6] = { - /* Private Global Constant Group Flat Region*/ - /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , MayAlias, NoAlias}, - /* Global */ {NoAlias , MayAlias, NoAlias , NoAlias , MayAlias, NoAlias}, - /* Constant */ {NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, NoAlias}, - /* Group */ {NoAlias , NoAlias , NoAlias , MayAlias, MayAlias, NoAlias}, - /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias}, - /* Region */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias} - }; - static const AliasResult ASAliasRulesGenIsZero[6][6] = { - /* Flat Global Region Group Constant Private */ - /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias}, - /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias}, - /* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias}, - /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias}, - /* Region */ {MayAlias, NoAlias , NoAlias , NoAlias, MayAlias, NoAlias}, - /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias} - }; - assert(AS.MAX_COMMON_ADDRESS <= 5); - if (AS.FLAT_ADDRESS == 0) { - assert(AS.GLOBAL_ADDRESS == 1 && - AS.REGION_ADDRESS == 2 && - AS.LOCAL_ADDRESS == 3 && - AS.CONSTANT_ADDRESS == 4 && - AS.PRIVATE_ADDRESS == 5); - ASAliasRules = &ASAliasRulesGenIsZero; - } else { - assert(AS.PRIVATE_ADDRESS == 0 && - AS.GLOBAL_ADDRESS == 1 && - AS.CONSTANT_ADDRESS == 2 && - AS.LOCAL_ADDRESS == 3 && - AS.FLAT_ADDRESS == 4 && - AS.REGION_ADDRESS == 5); - ASAliasRules = &ASAliasRulesPrivIsZero; - } -} +// These arrays are indexed by address space value enum elements 0 ... 
to 6 +static const AliasResult ASAliasRules[7][7] = { + /* Flat Global Region Group Constant Private Constant 32-bit */ + /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias}, + /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , MayAlias}, + /* Region */ {MayAlias, NoAlias , NoAlias , NoAlias, MayAlias, NoAlias , MayAlias}, + /* Group */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias , NoAlias}, + /* Constant */ {MayAlias, MayAlias, MayAlias, NoAlias , NoAlias, NoAlias , MayAlias}, + /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, NoAlias}, + /* Constant 32-bit */ {MayAlias, MayAlias, MayAlias, NoAlias , MayAlias, NoAlias , NoAlias} +}; -AliasResult AMDGPUAAResult::ASAliasRulesTy::getAliasResult(unsigned AS1, - unsigned AS2) const { - if (AS1 > AS.MAX_COMMON_ADDRESS || AS2 > AS.MAX_COMMON_ADDRESS) { - if (Arch == Triple::amdgcn) - report_fatal_error("Pointer address space out of range"); - return AS1 == AS2 ? MayAlias : NoAlias; - } +static AliasResult getAliasResult(unsigned AS1, unsigned AS2) { + static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 6, "Addr space out of range"); - return (*ASAliasRules)[AS1][AS2]; + if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS) + return MayAlias; + + return ASAliasRules[AS1][AS2]; } AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, @@ -104,8 +80,9 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace(); unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace(); - AliasResult Result = ASAliasRules.getAliasResult(asA, asB); - if (Result == NoAlias) return Result; + AliasResult Result = getAliasResult(asA, asB); + if (Result == NoAlias) + return Result; // Forward the query to the next alias analysis. 
return AAResultBase::alias(LocA, LocB); @@ -114,9 +91,9 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal) { const Value *Base = GetUnderlyingObject(Loc.Ptr, DL); - - if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS || - Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS_32BIT) { + unsigned AS = Base->getType()->getPointerAddressSpace(); + if (AS == AMDGPUAS::CONSTANT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { return true; } diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h index 645a38af753ce..d76c9fc481995 100644 --- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h +++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h @@ -33,14 +33,12 @@ class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> { friend AAResultBase<AMDGPUAAResult>; const DataLayout &DL; - AMDGPUAS AS; public: explicit AMDGPUAAResult(const DataLayout &DL, Triple T) : AAResultBase(), - DL(DL), AS(AMDGPU::getAMDGPUAS(T)), ASAliasRules(AS, T.getArch()) {} + DL(DL) {} AMDGPUAAResult(AMDGPUAAResult &&Arg) - : AAResultBase(std::move(Arg)), DL(Arg.DL), AS(Arg.AS), - ASAliasRules(Arg.ASAliasRules){} + : AAResultBase(std::move(Arg)), DL(Arg.DL) {} /// Handle invalidation events from the new pass manager. /// @@ -53,18 +51,6 @@ public: private: bool Aliases(const MDNode *A, const MDNode *B) const; bool PathAliases(const MDNode *A, const MDNode *B) const; - - class ASAliasRulesTy { - public: - ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_); - - AliasResult getAliasResult(unsigned AS1, unsigned AS2) const; - - private: - Triple::ArchType Arch; - AMDGPUAS AS; - const AliasResult (*ASAliasRules)[6][6]; - } ASAliasRules; }; /// Analysis pass providing a never-invalidated alias analysis result. 
@@ -110,6 +96,19 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override; }; +// Wrapper around ExternalAAWrapperPass so that the default constructor gets the +// callback. +class AMDGPUExternalAAWrapper : public ExternalAAWrapperPass { +public: + static char ID; + + AMDGPUExternalAAWrapper() : ExternalAAWrapperPass( + [](Pass &P, Function &, AAResults &AAR) { + if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>()) + AAR.addAAResult(WrapperPass->getResult()); + }) {} +}; + } // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index d4bbb2c1eb8d1..fc65430b745f3 100644 --- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -86,8 +86,6 @@ void AMDGPUAlwaysInline::recursivelyVisitUsers( } bool AMDGPUAlwaysInline::runOnModule(Module &M) { - AMDGPUAS AMDGPUAS = AMDGPU::getAMDGPUAS(M); - std::vector<GlobalAlias*> AliasesToRemove; SmallPtrSet<Function *, 8> FuncsToAlwaysInline; @@ -122,7 +120,7 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) { for (GlobalVariable &GV : M.globals()) { // TODO: Region address unsigned AS = GV.getType()->getAddressSpace(); - if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS.REGION_ADDRESS) + if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS) continue; recursivelyVisitUsers(GV, FuncsToAlwaysInline); diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index 1a70833a4472f..896ac9c87779e 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -46,7 +46,6 @@ namespace { class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass { private: const TargetMachine *TM = nullptr; - AMDGPUAS AS; bool addFeatureAttributes(Function &F); @@ -67,11 +66,10 @@ public: 
CallGraphSCCPass::getAnalysisUsage(AU); } - static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS); + static bool visitConstantExpr(const ConstantExpr *CE); static bool visitConstantExprsRecursively( const Constant *EntryC, - SmallPtrSet<const Constant *, 8> &ConstantExprVisited, - AMDGPUAS AS); + SmallPtrSet<const Constant *, 8> &ConstantExprVisited); }; } // end anonymous namespace @@ -85,20 +83,18 @@ INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, // The queue ptr is only needed when casting to flat, not from it. -static bool castRequiresQueuePtr(unsigned SrcAS, const AMDGPUAS &AS) { - return SrcAS == AS.LOCAL_ADDRESS || SrcAS == AS.PRIVATE_ADDRESS; +static bool castRequiresQueuePtr(unsigned SrcAS) { + return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS; } -static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC, - const AMDGPUAS &AS) { - return castRequiresQueuePtr(ASC->getSrcAddressSpace(), AS); +static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) { + return castRequiresQueuePtr(ASC->getSrcAddressSpace()); } -bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE, - AMDGPUAS AS) { +bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) { if (CE->getOpcode() == Instruction::AddrSpaceCast) { unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace(); - return castRequiresQueuePtr(SrcAS, AS); + return castRequiresQueuePtr(SrcAS); } return false; @@ -106,8 +102,7 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE, bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( const Constant *EntryC, - SmallPtrSet<const Constant *, 8> &ConstantExprVisited, - AMDGPUAS AS) { + SmallPtrSet<const Constant *, 8> &ConstantExprVisited) { if (!ConstantExprVisited.insert(EntryC).second) return false; @@ -120,7 +115,7 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( // Check this constant expression. 
if (const auto *CE = dyn_cast<ConstantExpr>(C)) { - if (visitConstantExpr(CE, AS)) + if (visitConstantExpr(CE)) return true; } @@ -262,7 +257,7 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { continue; if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) { - if (castRequiresQueuePtr(ASC, AS)) { + if (castRequiresQueuePtr(ASC)) { NeedQueuePtr = true; continue; } @@ -273,7 +268,7 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { if (!OpC) continue; - if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) { + if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) { NeedQueuePtr = true; break; } @@ -318,7 +313,6 @@ bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) { if (!TPC) report_fatal_error("TargetMachine is required"); - AS = AMDGPU::getAMDGPUAS(CG.getModule()); TM = &TPC->getTM<TargetMachine>(); return false; } diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index ed5370826647f..f88e3b0dac860 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -16,7 +16,7 @@ #include "AMDGPU.h" #include "AMDGPUIntrinsicInfo.h" #include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/IR/IRBuilder.h" @@ -32,12 +32,11 @@ namespace { class AMDGPUAnnotateUniformValues : public FunctionPass, public InstVisitor<AMDGPUAnnotateUniformValues> { - DivergenceAnalysis *DA; + LegacyDivergenceAnalysis *DA; MemoryDependenceResults *MDR; LoopInfo *LI; DenseMap<Value*, GetElementPtrInst*> noClobberClones; bool isKernelFunc; - AMDGPUAS AMDGPUASI; public: static char ID; @@ -49,7 +48,7 @@ public: return "AMDGPU Annotate Uniform Values"; } void getAnalysisUsage(AnalysisUsage &AU) const override 
{ - AU.addRequired<DivergenceAnalysis>(); + AU.addRequired<LegacyDivergenceAnalysis>(); AU.addRequired<MemoryDependenceWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.setPreservesAll(); @@ -64,7 +63,7 @@ public: INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE, "Add AMDGPU uniform metadata", false, false) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, @@ -118,14 +117,8 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) { } void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) { - if (I.isUnconditional()) - return; - - Value *Cond = I.getCondition(); - if (!DA->isUniform(Cond)) - return; - - setUniformMetadata(I.getParent()->getTerminator()); + if (DA->isUniform(&I)) + setUniformMetadata(I.getParent()->getTerminator()); } void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { @@ -133,7 +126,7 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { if (!DA->isUniform(Ptr)) return; auto isGlobalLoad = [&](LoadInst &Load)->bool { - return Load.getPointerAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS; + return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }; // We're tracking up to the Function boundaries // We cannot go beyond because of FunctionPass restrictions @@ -168,7 +161,6 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { } bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) { - AMDGPUASI = AMDGPU::getAMDGPUAS(M); return false; } @@ -176,7 +168,7 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) { if (skipFunction(F)) return false; - DA = &getAnalysis<DivergenceAnalysis>(); + DA = &getAnalysis<LegacyDivergenceAnalysis>(); MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); LI = 
&getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL; diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index e62e5d52ad74f..2ded7cdb64899 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -40,11 +40,13 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; using namespace llvm::AMDGPU; +using namespace llvm::AMDGPU::HSAMD; // TODO: This should get the default rounding mode from the kernel. We just set // the default here, but this could change if the OpenCL rounding mode pragmas @@ -98,8 +100,11 @@ extern "C" void LLVMInitializeAMDGPUAsmPrinter() { AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)) { - AMDGPUASI = static_cast<AMDGPUTargetMachine*>(&TM)->getAMDGPUAS(); - } + if (IsaInfo::hasCodeObjectV3(getSTI())) + HSAMetadataStream.reset(new MetadataStreamerV3()); + else + HSAMetadataStream.reset(new MetadataStreamerV2()); +} StringRef AMDGPUAsmPrinter::getPassName() const { return "AMDGPU Assembly Printer"; @@ -116,62 +121,70 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { } void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { - if (IsaInfo::hasCodeObjectV3(getSTI()) && - TM.getTargetTriple().getOS() == Triple::AMDHSA) - return; + if (IsaInfo::hasCodeObjectV3(getSTI())) { + std::string ExpectedTarget; + raw_string_ostream ExpectedTargetOS(ExpectedTarget); + IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS); + + getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget); + } if (TM.getTargetTriple().getOS() != Triple::AMDHSA && TM.getTargetTriple().getOS() != Triple::AMDPAL) return; if 
(TM.getTargetTriple().getOS() == Triple::AMDHSA) - HSAMetadataStream.begin(M); + HSAMetadataStream->begin(M); if (TM.getTargetTriple().getOS() == Triple::AMDPAL) readPALMetadata(M); + if (IsaInfo::hasCodeObjectV3(getSTI())) + return; + // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2. if (TM.getTargetTriple().getOS() == Triple::AMDHSA) getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1); // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2. - IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(getSTI()->getFeatureBits()); + IsaVersion Version = getIsaVersion(getSTI()->getCPU()); getTargetStreamer()->EmitDirectiveHSACodeObjectISA( - ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU"); + Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU"); } void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { - // TODO: Add metadata to code object v3. - if (IsaInfo::hasCodeObjectV3(getSTI()) && - TM.getTargetTriple().getOS() == Triple::AMDHSA) - return; - // Following code requires TargetStreamer to be present. if (!getTargetStreamer()) return; - // Emit ISA Version (NT_AMD_AMDGPU_ISA). - std::string ISAVersionString; - raw_string_ostream ISAVersionStream(ISAVersionString); - IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream); - getTargetStreamer()->EmitISAVersion(ISAVersionStream.str()); + if (!IsaInfo::hasCodeObjectV3(getSTI())) { + // Emit ISA Version (NT_AMD_AMDGPU_ISA). + std::string ISAVersionString; + raw_string_ostream ISAVersionStream(ISAVersionString); + IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream); + getTargetStreamer()->EmitISAVersion(ISAVersionStream.str()); + } // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). 
if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { - HSAMetadataStream.end(); - getTargetStreamer()->EmitHSAMetadata(HSAMetadataStream.getHSAMetadata()); + HSAMetadataStream->end(); + bool Success = HSAMetadataStream->emitTo(*getTargetStreamer()); + (void)Success; + assert(Success && "Malformed HSA Metadata"); } - // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA). - if (TM.getTargetTriple().getOS() == Triple::AMDPAL) { - // Copy the PAL metadata from the map where we collected it into a vector, - // then write it as a .note. - PALMD::Metadata PALMetadataVector; - for (auto i : PALMetadataMap) { - PALMetadataVector.push_back(i.first); - PALMetadataVector.push_back(i.second); + if (!IsaInfo::hasCodeObjectV3(getSTI())) { + // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA). + if (TM.getTargetTriple().getOS() == Triple::AMDPAL) { + // Copy the PAL metadata from the map where we collected it into a vector, + // then write it as a .note. + PALMD::Metadata PALMetadataVector; + for (auto i : PALMetadataMap) { + PALMetadataVector.push_back(i.first); + PALMetadataVector.push_back(i.second); + } + getTargetStreamer()->EmitPALMetadata(PALMetadataVector); } - getTargetStreamer()->EmitPALMetadata(PALMetadataVector); } } @@ -193,13 +206,10 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); if (!MFI.isEntryFunction()) return; - if (IsaInfo::hasCodeObjectV3(getSTI()) && - TM.getTargetTriple().getOS() == Triple::AMDHSA) - return; const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); const Function &F = MF->getFunction(); - if (STM.isAmdCodeObjectV2(F) && + if (!STM.hasCodeObjectV3() && STM.isAmdHsaOrMesa(F) && (F.getCallingConv() == CallingConv::AMDGPU_KERNEL || F.getCallingConv() == CallingConv::SPIR_KERNEL)) { amd_kernel_code_t KernelCode; @@ -207,10 +217,8 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { getTargetStreamer()->EmitAMDKernelCodeT(KernelCode); } - if (TM.getTargetTriple().getOS() != 
Triple::AMDHSA) - return; - - HSAMetadataStream.emitKernel(*MF, CurrentProgramInfo); + if (STM.isAmdHsaOS()) + HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo); } void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { @@ -241,7 +249,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() { *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), CurrentProgramInfo.NumVGPRsForWavesPerEU, CurrentProgramInfo.NumSGPRsForWavesPerEU - - IsaInfo::getNumExtraSGPRs(getSTI()->getFeatureBits(), + IsaInfo::getNumExtraSGPRs(getSTI(), CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed), CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, @@ -259,7 +267,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>(); - if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(MF->getFunction())) { + if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) { SmallString<128> SymbolName; getNameWithPrefix(SymbolName, &MF->getFunction()), getTargetStreamer()->EmitAMDGPUSymbolType( @@ -562,7 +570,7 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs( const GCNSubtarget &ST) const { - return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(), + return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch); } @@ -759,7 +767,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( // 48 SGPRs - vcc, - flat_scr, -xnack int MaxSGPRGuess = - 47 - IsaInfo::getNumExtraSGPRs(ST.getFeatureBits(), true, + 47 - IsaInfo::getNumExtraSGPRs(getSTI(), true, ST.hasFlatAddressSpace()); MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); MaxVGPR = std::max(MaxVGPR, 23); @@ -824,7 +832,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and 
could be // unified. unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs( - STM.getFeatureBits(), ProgInfo.VCCUsed, ProgInfo.FlatUsed); + getSTI(), ProgInfo.VCCUsed, ProgInfo.FlatUsed); // Check the addressable register limit before we add ExtraSGPRs. if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && @@ -906,9 +914,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks( - STM.getFeatureBits(), ProgInfo.NumSGPRsForWavesPerEU); + &STM, ProgInfo.NumSGPRsForWavesPerEU); ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks( - STM.getFeatureBits(), ProgInfo.NumVGPRsForWavesPerEU); + &STM, ProgInfo.NumVGPRsForWavesPerEU); // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" @@ -1003,7 +1011,6 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) { void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) { - const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv()); @@ -1024,10 +1031,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(RsrcReg, 4); OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); - if (STM.isVGPRSpillingEnabled(MF.getFunction())) { - OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); - OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); - } + OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); + OutStreamer->EmitIntValue( + S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); } if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { @@ -1138,7 +1144,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, const SIMachineFunctionInfo 
*MFI = MF.getInfo<SIMachineFunctionInfo>(); const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); - AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits()); + AMDGPU::initDefaultAMDKernelCodeT(Out, getSTI()); Out.compute_pgm_resource_registers = CurrentProgramInfo.ComputePGMRSrc1 | diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 22982d912c708..167ac4b21e1e2 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -56,7 +56,7 @@ private: SIProgramInfo CurrentProgramInfo; DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo; - AMDGPU::HSAMD::MetadataStreamer HSAMetadataStream; + std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream; std::map<uint32_t, uint32_t> PALMetadataMap; uint64_t getFunctionCodeSize(const MachineFunction &MF) const; @@ -143,7 +143,6 @@ public: protected: mutable std::vector<std::string> DisasmLines, HexLines; mutable size_t DisasmLineMaxLen; - AMDGPUAS AMDGPUASI; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp new file mode 100644 index 0000000000000..644e4fd558bad --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -0,0 +1,458 @@ +//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass optimizes atomic operations by using a single lane of a wavefront +/// to perform the atomic operation, thus reducing contention on that memory +/// location. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#define DEBUG_TYPE "amdgpu-atomic-optimizer" + +using namespace llvm; + +namespace { + +enum DPP_CTRL { + DPP_ROW_SR1 = 0x111, + DPP_ROW_SR2 = 0x112, + DPP_ROW_SR4 = 0x114, + DPP_ROW_SR8 = 0x118, + DPP_WF_SR1 = 0x138, + DPP_ROW_BCAST15 = 0x142, + DPP_ROW_BCAST31 = 0x143 +}; + +struct ReplacementInfo { + Instruction *I; + Instruction::BinaryOps Op; + unsigned ValIdx; + bool ValDivergent; +}; + +class AMDGPUAtomicOptimizer : public FunctionPass, + public InstVisitor<AMDGPUAtomicOptimizer> { +private: + SmallVector<ReplacementInfo, 8> ToReplace; + const LegacyDivergenceAnalysis *DA; + const DataLayout *DL; + DominatorTree *DT; + bool HasDPP; + bool IsPixelShader; + + void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op, + unsigned ValIdx, bool ValDivergent) const; + + void setConvergent(CallInst *const CI) const; + +public: + static char ID; + + AMDGPUAtomicOptimizer() : FunctionPass(ID) {} + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<LegacyDivergenceAnalysis>(); + AU.addRequired<TargetPassConfig>(); + } + + void visitAtomicRMWInst(AtomicRMWInst &I); + void visitIntrinsicInst(IntrinsicInst &I); +}; + +} // namespace + +char AMDGPUAtomicOptimizer::ID = 0; + +char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID; + +bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) { + if (skipFunction(F)) { + return false; + } + + DA = &getAnalysis<LegacyDivergenceAnalysis>(); + DL = &F.getParent()->getDataLayout(); + DominatorTreeWrapperPass *const DTW = + 
getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTW ? &DTW->getDomTree() : nullptr; + const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); + const TargetMachine &TM = TPC.getTM<TargetMachine>(); + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); + HasDPP = ST.hasDPP(); + IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS; + + visit(F); + + const bool Changed = !ToReplace.empty(); + + for (ReplacementInfo &Info : ToReplace) { + optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent); + } + + ToReplace.clear(); + + return Changed; +} + +void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) { + // Early exit for unhandled address space atomic instructions. + switch (I.getPointerAddressSpace()) { + default: + return; + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::LOCAL_ADDRESS: + break; + } + + Instruction::BinaryOps Op; + + switch (I.getOperation()) { + default: + return; + case AtomicRMWInst::Add: + Op = Instruction::Add; + break; + case AtomicRMWInst::Sub: + Op = Instruction::Sub; + break; + } + + const unsigned PtrIdx = 0; + const unsigned ValIdx = 1; + + // If the pointer operand is divergent, then each lane is doing an atomic + // operation on a different address, and we cannot optimize that. + if (DA->isDivergent(I.getOperand(PtrIdx))) { + return; + } + + const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx)); + + // If the value operand is divergent, each lane is contributing a different + // value to the atomic calculation. We can only optimize divergent values if + // we have DPP available on our subtarget, and the atomic operation is 32 + // bits. + if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) { + return; + } + + // If we get here, we can optimize the atomic using a single wavefront-wide + // atomic operation to do the calculation for the entire wavefront, so + // remember the instruction so we can come back to it. 
+ const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent}; + + ToReplace.push_back(Info); +} + +void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) { + Instruction::BinaryOps Op; + + switch (I.getIntrinsicID()) { + default: + return; + case Intrinsic::amdgcn_buffer_atomic_add: + case Intrinsic::amdgcn_struct_buffer_atomic_add: + case Intrinsic::amdgcn_raw_buffer_atomic_add: + Op = Instruction::Add; + break; + case Intrinsic::amdgcn_buffer_atomic_sub: + case Intrinsic::amdgcn_struct_buffer_atomic_sub: + case Intrinsic::amdgcn_raw_buffer_atomic_sub: + Op = Instruction::Sub; + break; + } + + const unsigned ValIdx = 0; + + const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx)); + + // If the value operand is divergent, each lane is contributing a different + // value to the atomic calculation. We can only optimize divergent values if + // we have DPP available on our subtarget, and the atomic operation is 32 + // bits. + if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) { + return; + } + + // If any of the other arguments to the intrinsic are divergent, we can't + // optimize the operation. + for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) { + if (DA->isDivergent(I.getOperand(Idx))) { + return; + } + } + + // If we get here, we can optimize the atomic using a single wavefront-wide + // atomic operation to do the calculation for the entire wavefront, so + // remember the instruction so we can come back to it. + const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent}; + + ToReplace.push_back(Info); +} + +void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, + Instruction::BinaryOps Op, + unsigned ValIdx, + bool ValDivergent) const { + LLVMContext &Context = I.getContext(); + + // Start building just before the instruction. + IRBuilder<> B(&I); + + // If we are in a pixel shader, because of how we have to mask out helper + // lane invocations, we need to record the entry and exit BB's. 
+ BasicBlock *PixelEntryBB = nullptr; + BasicBlock *PixelExitBB = nullptr; + + // If we're optimizing an atomic within a pixel shader, we need to wrap the + // entire atomic operation in a helper-lane check. We do not want any helper + // lanes that are around only for the purposes of derivatives to take part + // in any cross-lane communication, and we use a branch on whether the lane is + // live to do this. + if (IsPixelShader) { + // Record I's original position as the entry block. + PixelEntryBB = I.getParent(); + + Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {}); + Instruction *const NonHelperTerminator = + SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr); + + // Record I's new position as the exit block. + PixelExitBB = I.getParent(); + + I.moveBefore(NonHelperTerminator); + B.SetInsertPoint(&I); + } + + Type *const Ty = I.getType(); + const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty); + Type *const VecTy = VectorType::get(B.getInt32Ty(), 2); + + // This is the value in the atomic operation we need to combine in order to + // reduce the number of atomic operations. + Value *const V = I.getOperand(ValIdx); + + // We need to know how many lanes are active within the wavefront, and we do + // this by getting the exec register, which tells us all the lanes that are + // active. + MDNode *const RegName = + llvm::MDNode::get(Context, llvm::MDString::get(Context, "exec")); + Value *const Metadata = llvm::MetadataAsValue::get(Context, RegName); + CallInst *const Exec = + B.CreateIntrinsic(Intrinsic::read_register, {B.getInt64Ty()}, {Metadata}); + setConvergent(Exec); + + // We need to know how many lanes are active within the wavefront that are + // below us. If we counted each lane linearly starting from 0, a lane is + // below us only if its associated index was less than ours. We do this by + // using the mbcnt intrinsic. 
+ Value *const BitCast = B.CreateBitCast(Exec, VecTy); + Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0)); + Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1)); + CallInst *const PartialMbcnt = B.CreateIntrinsic( + Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)}); + CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, + {ExtractHi, PartialMbcnt}); + + Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false); + + Value *LaneOffset = nullptr; + Value *NewV = nullptr; + + // If we have a divergent value in each lane, we need to combine the value + // using DPP. + if (ValDivergent) { + // First we need to set all inactive invocations to 0, so that they can + // correctly contribute to the final result. + CallInst *const SetInactive = B.CreateIntrinsic( + Intrinsic::amdgcn_set_inactive, Ty, {V, B.getIntN(TyBitWidth, 0)}); + setConvergent(SetInactive); + NewV = SetInactive; + + const unsigned Iters = 6; + const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2, + DPP_ROW_SR4, DPP_ROW_SR8, + DPP_ROW_BCAST15, DPP_ROW_BCAST31}; + const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc}; + + // This loop performs an inclusive scan across the wavefront, with all lanes + // active (by using the WWM intrinsic). + for (unsigned Idx = 0; Idx < Iters; Idx++) { + CallInst *const DPP = B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, Ty, + {NewV, B.getInt32(DPPCtrl[Idx]), + B.getInt32(RowMask[Idx]), + B.getInt32(0xf), B.getFalse()}); + setConvergent(DPP); + Value *const WWM = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP); + + NewV = B.CreateBinOp(Op, NewV, WWM); + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV); + } + + // NewV has returned the inclusive scan of V, but for the lane offset we + // require an exclusive scan. 
We do this by shifting the values from the + // entire wavefront right by 1, and by setting the bound_ctrl (last argument + // to the intrinsic below) to true, we can guarantee that 0 will be shifted + // into the 0'th invocation. + CallInst *const DPP = + B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, {Ty}, + {NewV, B.getInt32(DPP_WF_SR1), B.getInt32(0xf), + B.getInt32(0xf), B.getTrue()}); + setConvergent(DPP); + LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP); + + // Read the value from the last lane, which has accumulated the values of + // each active lane in the wavefront. This will be our new value which + // we will provide to the atomic operation. + if (TyBitWidth == 64) { + Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty()); + Value *const ExtractHi = + B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty()); + CallInst *const ReadLaneLo = B.CreateIntrinsic( + Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)}); + setConvergent(ReadLaneLo); + CallInst *const ReadLaneHi = B.CreateIntrinsic( + Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)}); + setConvergent(ReadLaneHi); + Value *const PartialInsert = B.CreateInsertElement( + UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0)); + Value *const Insert = + B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1)); + NewV = B.CreateBitCast(Insert, Ty); + } else if (TyBitWidth == 32) { + CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, + {}, {NewV, B.getInt32(63)}); + setConvergent(ReadLane); + NewV = ReadLane; + } else { + llvm_unreachable("Unhandled atomic bit width"); + } + } else { + // Get the total number of active lanes we have by using popcount. + Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Exec); + Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false); + + // Calculate the new value we will be contributing to the atomic operation + // for the entire wavefront.
+ NewV = B.CreateMul(V, CtpopCast); + LaneOffset = B.CreateMul(V, MbcntCast); + } + + // We only want a single lane to enter our new control flow, and we do this + // by checking if there are any active lanes below us. Only one lane will + // have 0 active lanes below us, so that will be the only one to progress. + Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0)); + + // Store I's original basic block before we split the block. + BasicBlock *const EntryBB = I.getParent(); + + // We need to introduce some new control flow to force a single lane to be + // active. We do this by splitting I's basic block at I, and introducing the + // new block such that: + // entry --> single_lane -\ + // \------------------> exit + Instruction *const SingleLaneTerminator = + SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr); + + // Move the IR builder into single_lane next. + B.SetInsertPoint(SingleLaneTerminator); + + // Clone the original atomic operation into single lane, replacing the + // original value with our newly created one. + Instruction *const NewI = I.clone(); + B.Insert(NewI); + NewI->setOperand(ValIdx, NewV); + + // Move the IR builder into exit next, and start inserting just before the + // original instruction. + B.SetInsertPoint(&I); + + // Create a PHI node to get our new atomic result into the exit block. + PHINode *const PHI = B.CreatePHI(Ty, 2); + PHI->addIncoming(UndefValue::get(Ty), EntryBB); + PHI->addIncoming(NewI, SingleLaneTerminator->getParent()); + + // We need to broadcast the value from the lowest active lane (the first + // lane) to all other lanes in the wavefront. We use an intrinsic for this, + // but have to handle 64-bit broadcasts with two calls to this intrinsic.
+ Value *BroadcastI = nullptr; + + if (TyBitWidth == 64) { + Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty()); + Value *const ExtractHi = + B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty()); + CallInst *const ReadFirstLaneLo = + B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo); + setConvergent(ReadFirstLaneLo); + CallInst *const ReadFirstLaneHi = + B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi); + setConvergent(ReadFirstLaneHi); + Value *const PartialInsert = B.CreateInsertElement( + UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0)); + Value *const Insert = + B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1)); + BroadcastI = B.CreateBitCast(Insert, Ty); + } else if (TyBitWidth == 32) { + CallInst *const ReadFirstLane = + B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI); + setConvergent(ReadFirstLane); + BroadcastI = ReadFirstLane; + } else { + llvm_unreachable("Unhandled atomic bit width"); + } + + // Now that we have the result of our single atomic operation, we need to + // get our individual lane's slice into the result. We use the lane offset we + // previously calculated combined with the atomic result value we got from the + // first lane, to get our lane's index into the atomic result. + Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset); + + if (IsPixelShader) { + // Need a final PHI to reconverge to above the helper lane branch mask. + B.SetInsertPoint(PixelExitBB->getFirstNonPHI()); + + PHINode *const PHI = B.CreatePHI(Ty, 2); + PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB); + PHI->addIncoming(Result, I.getParent()); + I.replaceAllUsesWith(PHI); + } else { + // Replace the original atomic instruction with the new one. + I.replaceAllUsesWith(Result); + } + + // And delete the original. 
+ I.eraseFromParent(); +} + +void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const { + CI->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent); +} + +INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE, + "AMDGPU atomic optimizations", false, false) +INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE, + "AMDGPU atomic optimizations", false, false) + +FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() { + return new AMDGPUAtomicOptimizer(); +} diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 18c7df0d94f21..daef37f9c21fb 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -28,11 +28,12 @@ using namespace llvm; AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) - : CallLowering(&TLI), AMDGPUASI(TLI.getAMDGPUAS()) { + : CallLowering(&TLI) { } bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, - const Value *Val, unsigned VReg) const { + const Value *Val, + ArrayRef<unsigned> VRegs) const { // FIXME: Add support for non-void returns. 
if (Val) return false; @@ -50,7 +51,7 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); const DataLayout &DL = F.getParent()->getDataLayout(); - PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS); + PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS); LLT PtrType = getLLTForType(*PtrTy, DL); unsigned DstReg = MRI.createGenericVirtualRegister(PtrType); unsigned KernArgSegmentPtr = @@ -72,7 +73,7 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, MachineFunction &MF = MIRBuilder.getMF(); const Function &F = MF.getFunction(); const DataLayout &DL = F.getParent()->getDataLayout(); - PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS); + PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); unsigned TypeSize = DL.getTypeStoreSize(ParamTy); unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset); diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h index f51cb6abbf65c..ed859716218ef 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.h +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -23,8 +23,6 @@ namespace llvm { class AMDGPUTargetLowering; class AMDGPUCallLowering: public CallLowering { - AMDGPUAS AMDGPUASI; - unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy, uint64_t Offset) const; @@ -35,8 +33,8 @@ class AMDGPUCallLowering: public CallLowering { public: AMDGPUCallLowering(const AMDGPUTargetLowering &TLI); - bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val, - unsigned VReg) const override; + bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, + ArrayRef<unsigned> VRegs) const override; bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const override; 
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg); diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td index 68bc7fdd99618..367f120b5fa6b 100644 --- a/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -19,7 +19,7 @@ class CCIfExtend<CCAction A> // Calling convention for SI def CC_SI : CallingConv<[ - CCIfInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[ + CCIfInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, @@ -33,7 +33,7 @@ def CC_SI : CallingConv<[ CCIfByVal<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>, // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs. - CCIfNotInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[ + CCIfNotInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, @@ -64,7 +64,7 @@ def RetCC_SI_Shader : CallingConv<[ ]>>, // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. 
- CCIfType<[f32, f16] , CCAssignToReg<[ + CCIfType<[f32, f16, v2f16] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 5713b7b7f9a84..4dc1e67c573d3 100644 --- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -18,7 +18,7 @@ #include "AMDGPUTargetMachine.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AssumptionCache.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/Passes.h" @@ -60,10 +60,9 @@ class AMDGPUCodeGenPrepare : public FunctionPass, public InstVisitor<AMDGPUCodeGenPrepare, bool> { const GCNSubtarget *ST = nullptr; AssumptionCache *AC = nullptr; - DivergenceAnalysis *DA = nullptr; + LegacyDivergenceAnalysis *DA = nullptr; Module *Mod = nullptr; bool HasUnsafeFPMath = false; - AMDGPUAS AMDGPUASI; /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to /// binary operation \p V. 
@@ -177,7 +176,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<DivergenceAnalysis>(); + AU.addRequired<LegacyDivergenceAnalysis>(); AU.setPreservesAll(); } }; @@ -559,7 +558,7 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, Value *FQM = Builder.CreateFMul(FA, RCP); // fq = trunc(fqm); - CallInst* FQ = Builder.CreateIntrinsic(Intrinsic::trunc, { FQM }); + CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM); FQ->copyFastMathFlags(Builder.getFastMathFlags()); // float fqneg = -fq; @@ -567,17 +566,17 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, // float fr = mad(fqneg, fb, fa); Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz, - { FQNeg, FB, FA }, FQ); + {FQNeg->getType()}, {FQNeg, FB, FA}, FQ); // int iq = (int)fq; Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty) : Builder.CreateFPToUI(FQ, I32Ty); // fr = fabs(fr); - FR = Builder.CreateIntrinsic(Intrinsic::fabs, { FR }, FQ); + FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ); // fb = fabs(fb); - FB = Builder.CreateIntrinsic(Intrinsic::fabs, { FB }, FQ); + FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ); // int cv = fr >= fb; Value *CV = Builder.CreateFCmpOGE(FR, FB); @@ -799,8 +798,8 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { if (!WidenLoads) return false; - if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || - I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && + if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || + I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && canWidenScalarExtLoad(I)) { IRBuilder<> Builder(&I); Builder.SetCurrentDebugLocation(I.getDebugLoc()); @@ -898,9 +897,8 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>(); ST = &TM.getSubtarget<GCNSubtarget>(F); AC = 
&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - DA = &getAnalysis<DivergenceAnalysis>(); + DA = &getAnalysis<LegacyDivergenceAnalysis>(); HasUnsafeFPMath = hasUnsafeFPMath(F); - AMDGPUASI = TM.getAMDGPUAS(); bool MadeChange = false; @@ -918,7 +916,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", false, false) diff --git a/lib/Target/AMDGPU/AMDGPUFeatures.td b/lib/Target/AMDGPU/AMDGPUFeatures.td index b375cae9018ea..3c7d8a8fc5509 100644 --- a/lib/Target/AMDGPU/AMDGPUFeatures.td +++ b/lib/Target/AMDGPU/AMDGPUFeatures.td @@ -19,6 +19,15 @@ def FeatureFMA : SubtargetFeature<"fmaf", "Enable single precision FMA (not as fast as mul+add, but fused)" >; +// Some instructions do not support denormals despite this flag. Using +// fp32 denormals also causes instructions to run at the double +// precision rate for the device. +def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", + "FP32Denormals", + "true", + "Enable single precision denormal handling" +>; + class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature< "localmemorysize"#Value, "LocalMemorySize", diff --git a/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp new file mode 100644 index 0000000000000..6e2a981d33968 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp @@ -0,0 +1,63 @@ +//===-- AMDGPUFixFunctionBitcasts.cpp - Fix function bitcasts -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Promote indirect (bitcast) calls to direct calls when they are statically +/// known to be direct. Required when InstCombine is not run (e.g. at OptNone) +/// because AMDGPU does not support indirect calls. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Transforms/Utils/CallPromotionUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-fix-function-bitcasts" + +namespace { +class AMDGPUFixFunctionBitcasts final + : public ModulePass, + public InstVisitor<AMDGPUFixFunctionBitcasts> { + + bool runOnModule(Module &M) override; + + bool Modified; + +public: + void visitCallSite(CallSite CS) { + if (CS.getCalledFunction()) + return; + auto Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts()); + if (Callee && isLegalToPromote(CS, Callee)) { + promoteCall(CS, Callee); + Modified = true; + } + } + + static char ID; + AMDGPUFixFunctionBitcasts() : ModulePass(ID) {} +}; +} // End anonymous namespace + +char AMDGPUFixFunctionBitcasts::ID = 0; +char &llvm::AMDGPUFixFunctionBitcastsID = AMDGPUFixFunctionBitcasts::ID; +INITIALIZE_PASS(AMDGPUFixFunctionBitcasts, DEBUG_TYPE, + "Fix function bitcasts for AMDGPU", false, false) + +ModulePass *llvm::createAMDGPUFixFunctionBitcastsPass() { + return new AMDGPUFixFunctionBitcasts(); +} + +bool AMDGPUFixFunctionBitcasts::runOnModule(Module &M) { + Modified = false; + visit(M); + return Modified; +} diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td index ba735390f6791..59bb2a16e0f34 100644 --- a/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/lib/Target/AMDGPU/AMDGPUGISel.td @@ -122,15 +122,14 @@ def : GISelVop2CommutePat <sra, V_ASHRREV_I32_e32, i32>; } def : GISelVop3Pat2CommutePat <sra, V_ASHRREV_I32_e64, i32>; -// FIXME: Select directly to _e32 so we don't need to deal 
with modifiers. // FIXME: We can't re-use SelectionDAG patterns here because they match // against a custom SDNode and we would need to create a generic machine // instruction that is equivalent to the custom SDNode. This would also require // us to custom legalize the intrinsic to the new generic machine instruction, // but I can't get custom legalizing of intrinsic to work and I'm not sure if // this is even supported yet. -defm : GISelVop2IntrPat < - int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e32, v2f16, f32>; +def : GISelVop3Pat2ModsPat < + int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e64, v2f16, f32>; defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>; def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F64, f64>; diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def index 3a58c6c6a29fe..6eab59ab4e09b 100644 --- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def +++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -16,34 +16,38 @@ namespace AMDGPU { enum PartialMappingIdx { None = - 1, - PM_SGPR1 = 0, - PM_SGPR16 = 4, - PM_SGPR32 = 5, - PM_SGPR64 = 6, - PM_SGPR128 = 7, - PM_SGPR256 = 8, - PM_SGPR512 = 9, - PM_VGPR1 = 10, - PM_VGPR16 = 14, - PM_VGPR32 = 15, - PM_VGPR64 = 16, - PM_VGPR128 = 17, - PM_VGPR256 = 18, - PM_VGPR512 = 19, - PM_SGPR96 = 20, - PM_VGPR96 = 21 + PM_SGPR1 = 2, + PM_SGPR16 = 6, + PM_SGPR32 = 7, + PM_SGPR64 = 8, + PM_SGPR128 = 9, + PM_SGPR256 = 10, + PM_SGPR512 = 11, + PM_VGPR1 = 12, + PM_VGPR16 = 16, + PM_VGPR32 = 17, + PM_VGPR64 = 18, + PM_VGPR128 = 19, + PM_VGPR256 = 20, + PM_VGPR512 = 21, + PM_SGPR96 = 22, + PM_VGPR96 = 23 }; const RegisterBankInfo::PartialMapping PartMappings[] { // StartIdx, Length, RegBank {0, 1, SCCRegBank}, + {0, 1, VCCRegBank}, + + {0, 1, SGPRRegBank}, // SGPR begin {0, 16, SGPRRegBank}, {0, 32, SGPRRegBank}, {0, 64, SGPRRegBank}, {0, 128, SGPRRegBank}, {0, 256, SGPRRegBank}, {0, 512, SGPRRegBank}, - {0, 1, SGPRRegBank}, + + {0, 1, VGPRRegBank}, // VGPR begin 
{0, 16, VGPRRegBank}, {0, 32, VGPRRegBank}, {0, 64, VGPRRegBank}, @@ -55,33 +59,43 @@ const RegisterBankInfo::PartialMapping PartMappings[] { }; const RegisterBankInfo::ValueMapping ValMappings[] { + // SCC {&PartMappings[0], 1}, - {nullptr, 0}, - {nullptr, 0}, - {nullptr, 0}, + + // VCC {&PartMappings[1], 1}, + + // SGPRs {&PartMappings[2], 1}, + {nullptr, 0}, // Illegal power of 2 sizes + {nullptr, 0}, + {nullptr, 0}, {&PartMappings[3], 1}, {&PartMappings[4], 1}, {&PartMappings[5], 1}, {&PartMappings[6], 1}, {&PartMappings[7], 1}, + {&PartMappings[8], 1}, + + // VGPRs + {&PartMappings[9], 1}, {nullptr, 0}, {nullptr, 0}, {nullptr, 0}, - {&PartMappings[8], 1}, - {&PartMappings[9], 1}, {&PartMappings[10], 1}, {&PartMappings[11], 1}, {&PartMappings[12], 1}, {&PartMappings[13], 1}, {&PartMappings[14], 1}, - {&PartMappings[15], 1} + {&PartMappings[15], 1}, + {&PartMappings[16], 1}, + {&PartMappings[17], 1} }; enum ValueMappingIdx { - SGPRStartIdx = 0, - VGPRStartIdx = 10 + SCCStartIdx = 0, + SGPRStartIdx = 2, + VGPRStartIdx = 12 }; const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID, @@ -89,16 +103,28 @@ const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID, unsigned Idx; switch (Size) { case 1: - Idx = BankID == AMDGPU::SCCRegBankID ? PM_SGPR1 : PM_VGPR1; + if (BankID == AMDGPU::SCCRegBankID) + return &ValMappings[0]; + if (BankID == AMDGPU::VCCRegBankID) + return &ValMappings[1]; + + // 1-bit values not from a compare etc. + Idx = BankID == AMDGPU::SGPRRegBankID ? PM_SGPR1 : PM_VGPR1; break; case 96: + assert(BankID != AMDGPU::VCCRegBankID); Idx = BankID == AMDGPU::SGPRRegBankID ? PM_SGPR96 : PM_VGPR96; break; default: + assert(BankID != AMDGPU::VCCRegBankID); Idx = BankID == AMDGPU::VGPRRegBankID ? 
VGPRStartIdx : SGPRStartIdx; Idx += Log2_32_Ceil(Size); break; } + + assert(Log2_32_Ceil(Size) == Log2_32_Ceil(ValMappings[Idx].BreakDown->Length)); + assert(BankID == ValMappings[Idx].BreakDown->RegBank->getID()); + return &ValMappings[Idx]; } diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 01ef346f74ee8..c38b0e61558b3 100644 --- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -16,6 +16,7 @@ #include "AMDGPUHSAMetadataStreamer.h" #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUTargetStreamer.h" #include "SIMachineFunctionInfo.h" #include "SIProgramInfo.h" #include "Utils/AMDGPUBaseInfo.h" @@ -36,11 +37,14 @@ static cl::opt<bool> VerifyHSAMetadata( namespace AMDGPU { namespace HSAMD { -void MetadataStreamer::dump(StringRef HSAMetadataString) const { +//===----------------------------------------------------------------------===// +// HSAMetadataStreamerV2 +//===----------------------------------------------------------------------===// +void MetadataStreamerV2::dump(StringRef HSAMetadataString) const { errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n'; } -void MetadataStreamer::verify(StringRef HSAMetadataString) const { +void MetadataStreamerV2::verify(StringRef HSAMetadataString) const { errs() << "AMDGPU HSA Metadata Parser Test: "; HSAMD::Metadata FromHSAMetadataString; @@ -63,7 +67,8 @@ void MetadataStreamer::verify(StringRef HSAMetadataString) const { } } -AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const { +AccessQualifier +MetadataStreamerV2::getAccessQualifier(StringRef AccQual) const { if (AccQual.empty()) return AccessQualifier::Unknown; @@ -74,26 +79,29 @@ AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const { .Default(AccessQualifier::Default); } -AddressSpaceQualifier MetadataStreamer::getAddressSpaceQualifer( 
+AddressSpaceQualifier +MetadataStreamerV2::getAddressSpaceQualifier( unsigned AddressSpace) const { - if (AddressSpace == AMDGPUASI.PRIVATE_ADDRESS) + switch (AddressSpace) { + case AMDGPUAS::PRIVATE_ADDRESS: return AddressSpaceQualifier::Private; - if (AddressSpace == AMDGPUASI.GLOBAL_ADDRESS) + case AMDGPUAS::GLOBAL_ADDRESS: return AddressSpaceQualifier::Global; - if (AddressSpace == AMDGPUASI.CONSTANT_ADDRESS) + case AMDGPUAS::CONSTANT_ADDRESS: return AddressSpaceQualifier::Constant; - if (AddressSpace == AMDGPUASI.LOCAL_ADDRESS) + case AMDGPUAS::LOCAL_ADDRESS: return AddressSpaceQualifier::Local; - if (AddressSpace == AMDGPUASI.FLAT_ADDRESS) + case AMDGPUAS::FLAT_ADDRESS: return AddressSpaceQualifier::Generic; - if (AddressSpace == AMDGPUASI.REGION_ADDRESS) + case AMDGPUAS::REGION_ADDRESS: return AddressSpaceQualifier::Region; - - llvm_unreachable("Unknown address space qualifier"); + default: + return AddressSpaceQualifier::Unknown; + } } -ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual, - StringRef BaseTypeName) const { +ValueKind MetadataStreamerV2::getValueKind(Type *Ty, StringRef TypeQual, + StringRef BaseTypeName) const { if (TypeQual.find("pipe") != StringRef::npos) return ValueKind::Pipe; @@ -114,13 +122,13 @@ ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual, .Case("queue_t", ValueKind::Queue) .Default(isa<PointerType>(Ty) ? (Ty->getPointerAddressSpace() == - AMDGPUASI.LOCAL_ADDRESS ? + AMDGPUAS::LOCAL_ADDRESS ? 
ValueKind::DynamicSharedPointer : ValueKind::GlobalBuffer) : ValueKind::ByValue); } -ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const { +ValueType MetadataStreamerV2::getValueType(Type *Ty, StringRef TypeName) const { switch (Ty->getTypeID()) { case Type::IntegerTyID: { auto Signed = !TypeName.startswith("u"); @@ -152,7 +160,7 @@ ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const { } } -std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const { +std::string MetadataStreamerV2::getTypeName(Type *Ty, bool Signed) const { switch (Ty->getTypeID()) { case Type::IntegerTyID: { if (!Signed) @@ -189,8 +197,8 @@ std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const { } } -std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions( - MDNode *Node) const { +std::vector<uint32_t> +MetadataStreamerV2::getWorkGroupDimensions(MDNode *Node) const { std::vector<uint32_t> Dims; if (Node->getNumOperands() != 3) return Dims; @@ -200,9 +208,9 @@ std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions( return Dims; } -Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps( - const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const { +Kernel::CodeProps::Metadata +MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const { const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); HSAMD::Kernel::CodeProps::Metadata HSACodeProps; @@ -229,9 +237,9 @@ Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps( return HSACodeProps; } -Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps( - const MachineFunction &MF, - const SIProgramInfo &ProgramInfo) const { +Kernel::DebugProps::Metadata +MetadataStreamerV2::getHSADebugProps(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const { const GCNSubtarget &STM = 
MF.getSubtarget<GCNSubtarget>(); HSAMD::Kernel::DebugProps::Metadata HSADebugProps; @@ -251,14 +259,14 @@ Kernel::DebugProps::Metadata MetadataStreamer::getHSADebugProps( return HSADebugProps; } -void MetadataStreamer::emitVersion() { +void MetadataStreamerV2::emitVersion() { auto &Version = HSAMetadata.mVersion; Version.push_back(VersionMajor); Version.push_back(VersionMinor); } -void MetadataStreamer::emitPrintf(const Module &Mod) { +void MetadataStreamerV2::emitPrintf(const Module &Mod) { auto &Printf = HSAMetadata.mPrintf; auto Node = Mod.getNamedMetadata("llvm.printf.fmts"); @@ -270,7 +278,7 @@ void MetadataStreamer::emitPrintf(const Module &Mod) { Printf.push_back(cast<MDString>(Op->getOperand(0))->getString()); } -void MetadataStreamer::emitKernelLanguage(const Function &Func) { +void MetadataStreamerV2::emitKernelLanguage(const Function &Func) { auto &Kernel = HSAMetadata.mKernels.back(); // TODO: What about other languages? @@ -288,7 +296,7 @@ void MetadataStreamer::emitKernelLanguage(const Function &Func) { mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue()); } -void MetadataStreamer::emitKernelAttrs(const Function &Func) { +void MetadataStreamerV2::emitKernelAttrs(const Function &Func) { auto &Attrs = HSAMetadata.mKernels.back().mAttrs; if (auto Node = Func.getMetadata("reqd_work_group_size")) @@ -306,14 +314,14 @@ void MetadataStreamer::emitKernelAttrs(const Function &Func) { } } -void MetadataStreamer::emitKernelArgs(const Function &Func) { +void MetadataStreamerV2::emitKernelArgs(const Function &Func) { for (auto &Arg : Func.args()) emitKernelArg(Arg); emitHiddenKernelArgs(Func); } -void MetadataStreamer::emitKernelArg(const Argument &Arg) { +void MetadataStreamerV2::emitKernelArg(const Argument &Arg) { auto Func = Arg.getParent(); auto ArgNo = Arg.getArgNo(); const MDNode *Node; @@ -355,7 +363,7 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) { unsigned PointeeAlign = 0; if (auto PtrTy = dyn_cast<PointerType>(Ty)) { - if 
(PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) { + if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { PointeeAlign = Arg.getParamAlignment(); if (PointeeAlign == 0) PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType()); @@ -366,12 +374,12 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) { PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual); } -void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, - ValueKind ValueKind, - unsigned PointeeAlign, - StringRef Name, - StringRef TypeName, StringRef BaseTypeName, - StringRef AccQual, StringRef TypeQual) { +void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty, + ValueKind ValueKind, + unsigned PointeeAlign, StringRef Name, + StringRef TypeName, + StringRef BaseTypeName, + StringRef AccQual, StringRef TypeQual) { HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata()); auto &Arg = HSAMetadata.mKernels.back().mArgs.back(); @@ -384,7 +392,7 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, Arg.mPointeeAlign = PointeeAlign; if (auto PtrTy = dyn_cast<PointerType>(Ty)) - Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace()); + Arg.mAddrSpaceQual = getAddressSpaceQualifier(PtrTy->getAddressSpace()); Arg.mAccQual = getAccessQualifier(AccQual); @@ -404,7 +412,7 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, } } -void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) { +void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) { int HiddenArgNumBytes = getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0); @@ -422,7 +430,7 @@ void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) { emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ); auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), - AMDGPUASI.GLOBAL_ADDRESS); + AMDGPUAS::GLOBAL_ADDRESS); // Emit "printf buffer" argument if printf is used, otherwise emit dummy // "none" 
argument. @@ -446,13 +454,16 @@ void MetadataStreamer::emitHiddenKernelArgs(const Function &Func) { } } -void MetadataStreamer::begin(const Module &Mod) { - AMDGPUASI = getAMDGPUAS(Mod); +bool MetadataStreamerV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) { + return TargetStreamer.EmitHSAMetadata(getHSAMetadata()); +} + +void MetadataStreamerV2::begin(const Module &Mod) { emitVersion(); emitPrintf(Mod); } -void MetadataStreamer::end() { +void MetadataStreamerV2::end() { std::string HSAMetadataString; if (toString(HSAMetadata, HSAMetadataString)) return; @@ -463,7 +474,8 @@ void MetadataStreamer::end() { verify(HSAMetadataString); } -void MetadataStreamer::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) { +void MetadataStreamerV2::emitKernel(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) { auto &Func = MF.getFunction(); if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL) return; @@ -483,6 +495,505 @@ void MetadataStreamer::emitKernel(const MachineFunction &MF, const SIProgramInfo HSAMetadata.mKernels.back().mDebugProps = DebugProps; } +//===----------------------------------------------------------------------===// +// HSAMetadataStreamerV3 +//===----------------------------------------------------------------------===// + +void MetadataStreamerV3::dump(StringRef HSAMetadataString) const { + errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n'; +} + +void MetadataStreamerV3::verify(StringRef HSAMetadataString) const { + errs() << "AMDGPU HSA Metadata Parser Test: "; + + std::shared_ptr<msgpack::Node> FromHSAMetadataString = + std::make_shared<msgpack::MapNode>(); + + yaml::Input YIn(HSAMetadataString); + YIn >> FromHSAMetadataString; + if (YIn.error()) { + errs() << "FAIL\n"; + return; + } + + std::string ToHSAMetadataString; + raw_string_ostream StrOS(ToHSAMetadataString); + yaml::Output YOut(StrOS); + YOut << FromHSAMetadataString; + + errs() << (HSAMetadataString == StrOS.str() ? 
"PASS" : "FAIL") << '\n'; + if (HSAMetadataString != ToHSAMetadataString) { + errs() << "Original input: " << HSAMetadataString << '\n' + << "Produced output: " << StrOS.str() << '\n'; + } +} + +Optional<StringRef> +MetadataStreamerV3::getAccessQualifier(StringRef AccQual) const { + return StringSwitch<Optional<StringRef>>(AccQual) + .Case("read_only", StringRef("read_only")) + .Case("write_only", StringRef("write_only")) + .Case("read_write", StringRef("read_write")) + .Default(None); +} + +Optional<StringRef> +MetadataStreamerV3::getAddressSpaceQualifier(unsigned AddressSpace) const { + switch (AddressSpace) { + case AMDGPUAS::PRIVATE_ADDRESS: + return StringRef("private"); + case AMDGPUAS::GLOBAL_ADDRESS: + return StringRef("global"); + case AMDGPUAS::CONSTANT_ADDRESS: + return StringRef("constant"); + case AMDGPUAS::LOCAL_ADDRESS: + return StringRef("local"); + case AMDGPUAS::FLAT_ADDRESS: + return StringRef("generic"); + case AMDGPUAS::REGION_ADDRESS: + return StringRef("region"); + default: + return None; + } +} + +StringRef MetadataStreamerV3::getValueKind(Type *Ty, StringRef TypeQual, + StringRef BaseTypeName) const { + if (TypeQual.find("pipe") != StringRef::npos) + return "pipe"; + + return StringSwitch<StringRef>(BaseTypeName) + .Case("image1d_t", "image") + .Case("image1d_array_t", "image") + .Case("image1d_buffer_t", "image") + .Case("image2d_t", "image") + .Case("image2d_array_t", "image") + .Case("image2d_array_depth_t", "image") + .Case("image2d_array_msaa_t", "image") + .Case("image2d_array_msaa_depth_t", "image") + .Case("image2d_depth_t", "image") + .Case("image2d_msaa_t", "image") + .Case("image2d_msaa_depth_t", "image") + .Case("image3d_t", "image") + .Case("sampler_t", "sampler") + .Case("queue_t", "queue") + .Default(isa<PointerType>(Ty) + ? (Ty->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS + ? 
"dynamic_shared_pointer" + : "global_buffer") + : "by_value"); +} + +StringRef MetadataStreamerV3::getValueType(Type *Ty, StringRef TypeName) const { + switch (Ty->getTypeID()) { + case Type::IntegerTyID: { + auto Signed = !TypeName.startswith("u"); + switch (Ty->getIntegerBitWidth()) { + case 8: + return Signed ? "i8" : "u8"; + case 16: + return Signed ? "i16" : "u16"; + case 32: + return Signed ? "i32" : "u32"; + case 64: + return Signed ? "i64" : "u64"; + default: + return "struct"; + } + } + case Type::HalfTyID: + return "f16"; + case Type::FloatTyID: + return "f32"; + case Type::DoubleTyID: + return "f64"; + case Type::PointerTyID: + return getValueType(Ty->getPointerElementType(), TypeName); + case Type::VectorTyID: + return getValueType(Ty->getVectorElementType(), TypeName); + default: + return "struct"; + } +} + +std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const { + switch (Ty->getTypeID()) { + case Type::IntegerTyID: { + if (!Signed) + return (Twine('u') + getTypeName(Ty, true)).str(); + + auto BitWidth = Ty->getIntegerBitWidth(); + switch (BitWidth) { + case 8: + return "char"; + case 16: + return "short"; + case 32: + return "int"; + case 64: + return "long"; + default: + return (Twine('i') + Twine(BitWidth)).str(); + } + } + case Type::HalfTyID: + return "half"; + case Type::FloatTyID: + return "float"; + case Type::DoubleTyID: + return "double"; + case Type::VectorTyID: { + auto VecTy = cast<VectorType>(Ty); + auto ElTy = VecTy->getElementType(); + auto NumElements = VecTy->getVectorNumElements(); + return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str(); + } + default: + return "unknown"; + } +} + +std::shared_ptr<msgpack::ArrayNode> +MetadataStreamerV3::getWorkGroupDimensions(MDNode *Node) const { + auto Dims = std::make_shared<msgpack::ArrayNode>(); + if (Node->getNumOperands() != 3) + return Dims; + + for (auto &Op : Node->operands()) + Dims->push_back(std::make_shared<msgpack::ScalarNode>( + 
mdconst::extract<ConstantInt>(Op)->getZExtValue())); + return Dims; +} + +void MetadataStreamerV3::emitVersion() { + auto Version = std::make_shared<msgpack::ArrayNode>(); + Version->push_back(std::make_shared<msgpack::ScalarNode>(V3::VersionMajor)); + Version->push_back(std::make_shared<msgpack::ScalarNode>(V3::VersionMinor)); + getRootMetadata("amdhsa.version") = std::move(Version); +} + +void MetadataStreamerV3::emitPrintf(const Module &Mod) { + auto Node = Mod.getNamedMetadata("llvm.printf.fmts"); + if (!Node) + return; + + auto Printf = std::make_shared<msgpack::ArrayNode>(); + for (auto Op : Node->operands()) + if (Op->getNumOperands()) + Printf->push_back(std::make_shared<msgpack::ScalarNode>( + cast<MDString>(Op->getOperand(0))->getString())); + getRootMetadata("amdhsa.printf") = std::move(Printf); +} + +void MetadataStreamerV3::emitKernelLanguage(const Function &Func, + msgpack::MapNode &Kern) { + // TODO: What about other languages? + auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version"); + if (!Node || !Node->getNumOperands()) + return; + auto Op0 = Node->getOperand(0); + if (Op0->getNumOperands() <= 1) + return; + + Kern[".language"] = std::make_shared<msgpack::ScalarNode>("OpenCL C"); + auto LanguageVersion = std::make_shared<msgpack::ArrayNode>(); + LanguageVersion->push_back(std::make_shared<msgpack::ScalarNode>( + mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue())); + LanguageVersion->push_back(std::make_shared<msgpack::ScalarNode>( + mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue())); + Kern[".language_version"] = std::move(LanguageVersion); +} + +void MetadataStreamerV3::emitKernelAttrs(const Function &Func, + msgpack::MapNode &Kern) { + + if (auto Node = Func.getMetadata("reqd_work_group_size")) + Kern[".reqd_workgroup_size"] = getWorkGroupDimensions(Node); + if (auto Node = Func.getMetadata("work_group_size_hint")) + Kern[".workgroup_size_hint"] = getWorkGroupDimensions(Node); + if (auto Node = 
Func.getMetadata("vec_type_hint")) { + Kern[".vec_type_hint"] = std::make_shared<msgpack::ScalarNode>(getTypeName( + cast<ValueAsMetadata>(Node->getOperand(0))->getType(), + mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue())); + } + if (Func.hasFnAttribute("runtime-handle")) { + Kern[".device_enqueue_symbol"] = std::make_shared<msgpack::ScalarNode>( + Func.getFnAttribute("runtime-handle").getValueAsString().str()); + } +} + +void MetadataStreamerV3::emitKernelArgs(const Function &Func, + msgpack::MapNode &Kern) { + unsigned Offset = 0; + auto Args = std::make_shared<msgpack::ArrayNode>(); + for (auto &Arg : Func.args()) + emitKernelArg(Arg, Offset, *Args); + + emitHiddenKernelArgs(Func, Offset, *Args); + + // TODO: What about other languages? + if (Func.getParent()->getNamedMetadata("opencl.ocl.version")) { + auto &DL = Func.getParent()->getDataLayout(); + auto Int64Ty = Type::getInt64Ty(Func.getContext()); + + emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, *Args); + emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, *Args); + emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, *Args); + + auto Int8PtrTy = + Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); + + // Emit "printf buffer" argument if printf is used, otherwise emit dummy + // "none" argument. + if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) + emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, *Args); + else + emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args); + + // Emit "default queue" and "completion action" arguments if enqueue kernel + // is used, otherwise emit dummy "none" arguments. 
+ if (Func.hasFnAttribute("calls-enqueue-kernel")) { + emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, *Args); + emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, *Args); + } else { + emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args); + emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, *Args); + } + } + + Kern[".args"] = std::move(Args); +} + +void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset, + msgpack::ArrayNode &Args) { + auto Func = Arg.getParent(); + auto ArgNo = Arg.getArgNo(); + const MDNode *Node; + + StringRef Name; + Node = Func->getMetadata("kernel_arg_name"); + if (Node && ArgNo < Node->getNumOperands()) + Name = cast<MDString>(Node->getOperand(ArgNo))->getString(); + else if (Arg.hasName()) + Name = Arg.getName(); + + StringRef TypeName; + Node = Func->getMetadata("kernel_arg_type"); + if (Node && ArgNo < Node->getNumOperands()) + TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString(); + + StringRef BaseTypeName; + Node = Func->getMetadata("kernel_arg_base_type"); + if (Node && ArgNo < Node->getNumOperands()) + BaseTypeName = cast<MDString>(Node->getOperand(ArgNo))->getString(); + + StringRef AccQual; + if (Arg.getType()->isPointerTy() && Arg.onlyReadsMemory() && + Arg.hasNoAliasAttr()) { + AccQual = "read_only"; + } else { + Node = Func->getMetadata("kernel_arg_access_qual"); + if (Node && ArgNo < Node->getNumOperands()) + AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); + } + + StringRef TypeQual; + Node = Func->getMetadata("kernel_arg_type_qual"); + if (Node && ArgNo < Node->getNumOperands()) + TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); + + Type *Ty = Arg.getType(); + const DataLayout &DL = Func->getParent()->getDataLayout(); + + unsigned PointeeAlign = 0; + if (auto PtrTy = dyn_cast<PointerType>(Ty)) { + if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + PointeeAlign = Arg.getParamAlignment(); + if (PointeeAlign == 0) + 
PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType()); + } + } + + emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(), + getValueKind(Arg.getType(), TypeQual, BaseTypeName), Offset, + Args, PointeeAlign, Name, TypeName, BaseTypeName, AccQual, + TypeQual); +} + +void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty, + StringRef ValueKind, unsigned &Offset, + msgpack::ArrayNode &Args, + unsigned PointeeAlign, StringRef Name, + StringRef TypeName, + StringRef BaseTypeName, + StringRef AccQual, StringRef TypeQual) { + auto ArgPtr = std::make_shared<msgpack::MapNode>(); + auto &Arg = *ArgPtr; + + if (!Name.empty()) + Arg[".name"] = std::make_shared<msgpack::ScalarNode>(Name); + if (!TypeName.empty()) + Arg[".type_name"] = std::make_shared<msgpack::ScalarNode>(TypeName); + auto Size = DL.getTypeAllocSize(Ty); + auto Align = DL.getABITypeAlignment(Ty); + Arg[".size"] = std::make_shared<msgpack::ScalarNode>(Size); + Offset = alignTo(Offset, Align); + Arg[".offset"] = std::make_shared<msgpack::ScalarNode>(Offset); + Offset += Size; + Arg[".value_kind"] = std::make_shared<msgpack::ScalarNode>(ValueKind); + Arg[".value_type"] = + std::make_shared<msgpack::ScalarNode>(getValueType(Ty, BaseTypeName)); + if (PointeeAlign) + Arg[".pointee_align"] = std::make_shared<msgpack::ScalarNode>(PointeeAlign); + + if (auto PtrTy = dyn_cast<PointerType>(Ty)) + if (auto Qualifier = getAddressSpaceQualifier(PtrTy->getAddressSpace())) + Arg[".address_space"] = std::make_shared<msgpack::ScalarNode>(*Qualifier); + + if (auto AQ = getAccessQualifier(AccQual)) + Arg[".access"] = std::make_shared<msgpack::ScalarNode>(*AQ); + + // TODO: Emit Arg[".actual_access"]. 
+ + SmallVector<StringRef, 1> SplitTypeQuals; + TypeQual.split(SplitTypeQuals, " ", -1, false); + for (StringRef Key : SplitTypeQuals) { + if (Key == "const") + Arg[".is_const"] = std::make_shared<msgpack::ScalarNode>(true); + else if (Key == "restrict") + Arg[".is_restrict"] = std::make_shared<msgpack::ScalarNode>(true); + else if (Key == "volatile") + Arg[".is_volatile"] = std::make_shared<msgpack::ScalarNode>(true); + else if (Key == "pipe") + Arg[".is_pipe"] = std::make_shared<msgpack::ScalarNode>(true); + } + + Args.push_back(std::move(ArgPtr)); +} + +void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func, + unsigned &Offset, + msgpack::ArrayNode &Args) { + int HiddenArgNumBytes = + getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0); + + if (!HiddenArgNumBytes) + return; + + auto &DL = Func.getParent()->getDataLayout(); + auto Int64Ty = Type::getInt64Ty(Func.getContext()); + + if (HiddenArgNumBytes >= 8) + emitKernelArg(DL, Int64Ty, "hidden_global_offset_x", Offset, Args); + if (HiddenArgNumBytes >= 16) + emitKernelArg(DL, Int64Ty, "hidden_global_offset_y", Offset, Args); + if (HiddenArgNumBytes >= 24) + emitKernelArg(DL, Int64Ty, "hidden_global_offset_z", Offset, Args); + + auto Int8PtrTy = + Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS); + + // Emit "printf buffer" argument if printf is used, otherwise emit dummy + // "none" argument. + if (HiddenArgNumBytes >= 32) { + if (Func.getParent()->getNamedMetadata("llvm.printf.fmts")) + emitKernelArg(DL, Int8PtrTy, "hidden_printf_buffer", Offset, Args); + else + emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args); + } + + // Emit "default queue" and "completion action" arguments if enqueue kernel is + // used, otherwise emit dummy "none" arguments. 
+ if (HiddenArgNumBytes >= 48) { + if (Func.hasFnAttribute("calls-enqueue-kernel")) { + emitKernelArg(DL, Int8PtrTy, "hidden_default_queue", Offset, Args); + emitKernelArg(DL, Int8PtrTy, "hidden_completion_action", Offset, Args); + } else { + emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args); + emitKernelArg(DL, Int8PtrTy, "hidden_none", Offset, Args); + } + } +} + +std::shared_ptr<msgpack::MapNode> +MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const { + const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(); + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + const Function &F = MF.getFunction(); + + auto HSAKernelProps = std::make_shared<msgpack::MapNode>(); + auto &Kern = *HSAKernelProps; + + unsigned MaxKernArgAlign; + Kern[".kernarg_segment_size"] = std::make_shared<msgpack::ScalarNode>( + STM.getKernArgSegmentSize(F, MaxKernArgAlign)); + Kern[".group_segment_fixed_size"] = + std::make_shared<msgpack::ScalarNode>(ProgramInfo.LDSSize); + Kern[".private_segment_fixed_size"] = + std::make_shared<msgpack::ScalarNode>(ProgramInfo.ScratchSize); + Kern[".kernarg_segment_align"] = + std::make_shared<msgpack::ScalarNode>(std::max(uint32_t(4), MaxKernArgAlign)); + Kern[".wavefront_size"] = + std::make_shared<msgpack::ScalarNode>(STM.getWavefrontSize()); + Kern[".sgpr_count"] = std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumSGPR); + Kern[".vgpr_count"] = std::make_shared<msgpack::ScalarNode>(ProgramInfo.NumVGPR); + Kern[".max_flat_workgroup_size"] = + std::make_shared<msgpack::ScalarNode>(MFI.getMaxFlatWorkGroupSize()); + Kern[".sgpr_spill_count"] = + std::make_shared<msgpack::ScalarNode>(MFI.getNumSpilledSGPRs()); + Kern[".vgpr_spill_count"] = + std::make_shared<msgpack::ScalarNode>(MFI.getNumSpilledVGPRs()); + + return HSAKernelProps; +} + +bool MetadataStreamerV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) { + return TargetStreamer.EmitHSAMetadata(getHSAMetadataRoot(), 
true); +} + +void MetadataStreamerV3::begin(const Module &Mod) { + emitVersion(); + emitPrintf(Mod); + getRootMetadata("amdhsa.kernels").reset(new msgpack::ArrayNode()); +} + +void MetadataStreamerV3::end() { + std::string HSAMetadataString; + raw_string_ostream StrOS(HSAMetadataString); + yaml::Output YOut(StrOS); + YOut << HSAMetadataRoot; + + if (DumpHSAMetadata) + dump(StrOS.str()); + if (VerifyHSAMetadata) + verify(StrOS.str()); +} + +void MetadataStreamerV3::emitKernel(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) { + auto &Func = MF.getFunction(); + auto KernelProps = getHSAKernelProps(MF, ProgramInfo); + + assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL || + Func.getCallingConv() == CallingConv::SPIR_KERNEL); + + auto &KernelsNode = getRootMetadata("amdhsa.kernels"); + auto Kernels = cast<msgpack::ArrayNode>(KernelsNode.get()); + + { + auto &Kern = *KernelProps; + Kern[".name"] = std::make_shared<msgpack::ScalarNode>(Func.getName()); + Kern[".symbol"] = std::make_shared<msgpack::ScalarNode>( + (Twine(Func.getName()) + Twine(".kd")).str()); + emitKernelLanguage(Func, Kern); + emitKernelAttrs(Func, Kern); + emitKernelArgs(Func, Kern); + } + + Kernels->push_back(std::move(KernelProps)); +} + } // end namespace HSAMD } // end namespace AMDGPU } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index 3424c956d7816..afc09baf952d6 100644 --- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -19,10 +19,12 @@ #include "AMDGPU.h" #include "AMDKernelCodeT.h" #include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/MsgPackTypes.h" #include "llvm/Support/AMDGPUMetadata.h" namespace llvm { +class AMDGPUTargetStreamer; class Argument; class DataLayout; class Function; @@ -34,10 +36,94 @@ class Type; namespace AMDGPU { namespace HSAMD { -class MetadataStreamer final { +class MetadataStreamer { +public: + virtual 
~MetadataStreamer(){}; + + virtual bool emitTo(AMDGPUTargetStreamer &TargetStreamer) = 0; + + virtual void begin(const Module &Mod) = 0; + + virtual void end() = 0; + + virtual void emitKernel(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) = 0; +}; + +class MetadataStreamerV3 final : public MetadataStreamer { +private: + std::shared_ptr<msgpack::Node> HSAMetadataRoot = + std::make_shared<msgpack::MapNode>(); + + void dump(StringRef HSAMetadataString) const; + + void verify(StringRef HSAMetadataString) const; + + Optional<StringRef> getAccessQualifier(StringRef AccQual) const; + + Optional<StringRef> getAddressSpaceQualifier(unsigned AddressSpace) const; + + StringRef getValueKind(Type *Ty, StringRef TypeQual, + StringRef BaseTypeName) const; + + StringRef getValueType(Type *Ty, StringRef TypeName) const; + + std::string getTypeName(Type *Ty, bool Signed) const; + + std::shared_ptr<msgpack::ArrayNode> + getWorkGroupDimensions(MDNode *Node) const; + + std::shared_ptr<msgpack::MapNode> + getHSAKernelProps(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const; + + void emitVersion(); + + void emitPrintf(const Module &Mod); + + void emitKernelLanguage(const Function &Func, msgpack::MapNode &Kern); + + void emitKernelAttrs(const Function &Func, msgpack::MapNode &Kern); + + void emitKernelArgs(const Function &Func, msgpack::MapNode &Kern); + + void emitKernelArg(const Argument &Arg, unsigned &Offset, + msgpack::ArrayNode &Args); + + void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind, + unsigned &Offset, msgpack::ArrayNode &Args, + unsigned PointeeAlign = 0, StringRef Name = "", + StringRef TypeName = "", StringRef BaseTypeName = "", + StringRef AccQual = "", StringRef TypeQual = ""); + + void emitHiddenKernelArgs(const Function &Func, unsigned &Offset, + msgpack::ArrayNode &Args); + + std::shared_ptr<msgpack::Node> &getRootMetadata(StringRef Key) { + return (*cast<msgpack::MapNode>(HSAMetadataRoot.get()))[Key]; + } + + 
std::shared_ptr<msgpack::Node> &getHSAMetadataRoot() { + return HSAMetadataRoot; + } + +public: + MetadataStreamerV3() = default; + ~MetadataStreamerV3() = default; + + bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override; + + void begin(const Module &Mod) override; + + void end() override; + + void emitKernel(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) override; +}; + +class MetadataStreamerV2 final : public MetadataStreamer { private: Metadata HSAMetadata; - AMDGPUAS AMDGPUASI; void dump(StringRef HSAMetadataString) const; @@ -45,7 +131,7 @@ private: AccessQualifier getAccessQualifier(StringRef AccQual) const; - AddressSpaceQualifier getAddressSpaceQualifer(unsigned AddressSpace) const; + AddressSpaceQualifier getAddressSpaceQualifier(unsigned AddressSpace) const; ValueKind getValueKind(Type *Ty, StringRef TypeQual, StringRef BaseTypeName) const; @@ -83,19 +169,22 @@ private: void emitHiddenKernelArgs(const Function &Func); -public: - MetadataStreamer() = default; - ~MetadataStreamer() = default; - const Metadata &getHSAMetadata() const { return HSAMetadata; } - void begin(const Module &Mod); +public: + MetadataStreamerV2() = default; + ~MetadataStreamerV2() = default; + + bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override; + + void begin(const Module &Mod) override; - void end(); + void end() override; - void emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo); + void emitKernel(const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) override; }; } // end namespace HSAMD diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index f25f4d4693eac..a0a045e72a58f 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -29,7 +29,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include 
"llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/ISDOpcodes.h" @@ -72,14 +72,12 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can // make the right decision when generating code for different targets. const GCNSubtarget *Subtarget; - AMDGPUAS AMDGPUASI; bool EnableLateStructurizeCFG; public: explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr, CodeGenOpt::Level OptLevel = CodeGenOpt::Default) : SelectionDAGISel(*TM, OptLevel) { - AMDGPUASI = AMDGPU::getAMDGPUAS(*TM); EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG; } ~AMDGPUDAGToDAGISel() override = default; @@ -87,7 +85,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AMDGPUArgumentUsageInfo>(); AU.addRequired<AMDGPUPerfHintAnalysis>(); - AU.addRequired<DivergenceAnalysis>(); + AU.addRequired<LegacyDivergenceAnalysis>(); SelectionDAGISel::getAnalysisUsage(AU); } @@ -103,9 +101,12 @@ private: std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const; bool isNoNanSrc(SDValue N) const; bool isInlineImmediate(const SDNode *N) const; - + bool isVGPRImm(const SDNode *N) const; + bool isUniformLoad(const SDNode *N) const; bool isUniformBr(const SDNode *N) const; + MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const; + SDNode *glueCopyToM0(SDNode *N) const; const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; @@ -140,13 +141,6 @@ private: SDValue &Offset, SDValue &SLC) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; - bool SelectMUBUFConstant(SDValue Constant, - SDValue &SOffset, - SDValue &ImmOffset) const; - bool SelectMUBUFIntrinsicOffset(SDValue Offset, SDValue &SOffset, - SDValue &ImmOffset) const; - bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset, - 
SDValue &ImmOffset, SDValue &VOffset) const; bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const; @@ -224,7 +218,6 @@ protected: class R600DAGToDAGISel : public AMDGPUDAGToDAGISel { const R600Subtarget *Subtarget; - AMDGPUAS AMDGPUASI; bool isConstantLoad(const MemSDNode *N, int cbID) const; bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); @@ -232,9 +225,7 @@ class R600DAGToDAGISel : public AMDGPUDAGToDAGISel { SDValue& Offset); public: explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) : - AMDGPUDAGToDAGISel(TM, OptLevel) { - AMDGPUASI = AMDGPU::getAMDGPUAS(*TM); - } + AMDGPUDAGToDAGISel(TM, OptLevel) {} void Select(SDNode *N) override; @@ -251,12 +242,12 @@ protected: } // end anonymous namespace -INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "isel", +INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel", "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo) INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) -INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel", +INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) +INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel", "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) /// This pass converts a legalized DAG into a AMDGPU-specific @@ -350,7 +341,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, } SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { - if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS || + if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS || !Subtarget->ldsRequiresM0Init()) return N; @@ -372,6 +363,22 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); } +MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm, + EVT VT) const { + SDNode 
*Lo = CurDAG->getMachineNode( + AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, MVT::i32)); + SDNode *Hi = + CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getConstant(Imm >> 32, DL, MVT::i32)); + const SDValue Ops[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), + SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)}; + + return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops); +} + static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { switch (NumVectorElts) { case 1: @@ -557,19 +564,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { } SDLoc DL(N); - SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, - MVT::i32)); - SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getConstant(Imm >> 32, DL, MVT::i32)); - const SDValue Ops[] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; - - ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, - N->getValueType(0), Ops)); + ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0))); return; } case ISD::LOAD: @@ -641,6 +636,20 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case AMDGPUISD::ATOMIC_CMP_SWAP: SelectATOMIC_CMP_SWAP(N); return; + case AMDGPUISD::CVT_PKRTZ_F16_F32: + case AMDGPUISD::CVT_PKNORM_I16_F32: + case AMDGPUISD::CVT_PKNORM_U16_F32: + case AMDGPUISD::CVT_PK_U16_U32: + case AMDGPUISD::CVT_PK_I16_I32: { + // Hack around using a legal type if f16 is illegal. + if (N->getValueType(0) == MVT::i32) { + MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? 
MVT::v2f16 : MVT::v2i16; + N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT), + { N->getOperand(0), N->getOperand(1) }); + SelectCode(N); + return; + } + } } SelectCode(N); @@ -969,8 +978,6 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, // default case - // FIXME: This is broken on SI where we still need to check if the base - // pointer is positive here. Base = Addr; Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8); Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8); @@ -1000,55 +1007,72 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1); SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); + ConstantSDNode *C1 = nullptr; + SDValue N0 = Addr; if (CurDAG->isBaseWithConstantOffset(Addr)) { - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); - ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + C1 = cast<ConstantSDNode>(Addr.getOperand(1)); + if (isUInt<32>(C1->getZExtValue())) + N0 = Addr.getOperand(0); + else + C1 = nullptr; + } - if (N0.getOpcode() == ISD::ADD) { - // (add (add N2, N3), C1) -> addr64 - SDValue N2 = N0.getOperand(0); - SDValue N3 = N0.getOperand(1); - Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); + if (N0.getOpcode() == ISD::ADD) { + // (add N2, N3) -> addr64, or + // (add (add N2, N3), C1) -> addr64 + SDValue N2 = N0.getOperand(0); + SDValue N3 = N0.getOperand(1); + Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); + + if (N2->isDivergent()) { + if (N3->isDivergent()) { + // Both N2 and N3 are divergent. Use N0 (the result of the add) as the + // addr64, and construct the resource from a 0 address. + Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0); + VAddr = N0; + } else { + // N2 is divergent, N3 is not. + Ptr = N3; + VAddr = N2; + } + } else { + // N2 is not divergent. 
Ptr = N2; VAddr = N3; - } else { - // (add N0, C1) -> offset - VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); - Ptr = N0; } - - if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) { - Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return true; - } - - if (isUInt<32>(C1->getZExtValue())) { - // Illegal offset, store it in soffset. - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), - 0); - return true; - } - } - - if (Addr.getOpcode() == ISD::ADD) { - // (add N0, N1) -> addr64 - SDValue N0 = Addr.getOperand(0); - SDValue N1 = Addr.getOperand(1); + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + } else if (N0->isDivergent()) { + // N0 is divergent. Use it as the addr64, and construct the resource from a + // 0 address. + Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0); + VAddr = N0; Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1); + } else { + // N0 -> offset, or + // (N0 + C1) -> offset + VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); Ptr = N0; - VAddr = N1; + } + + if (!C1) { + // No offset. Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); return true; } - // default case -> offset - VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32); - Ptr = Addr; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) { + // Legal offset for instruction. + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return true; + } + // Illegal offset, store it in soffset. 
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + SOffset = + SDValue(CurDAG->getMachineNode( + AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), + 0); return true; } @@ -1252,101 +1276,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); } -bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant, - SDValue &SOffset, - SDValue &ImmOffset) const { - SDLoc DL(Constant); - const uint32_t Align = 4; - const uint32_t MaxImm = alignDown(4095, Align); - uint32_t Imm = cast<ConstantSDNode>(Constant)->getZExtValue(); - uint32_t Overflow = 0; - - if (Imm > MaxImm) { - if (Imm <= MaxImm + 64) { - // Use an SOffset inline constant for 4..64 - Overflow = Imm - MaxImm; - Imm = MaxImm; - } else { - // Try to keep the same value in SOffset for adjacent loads, so that - // the corresponding register contents can be re-used. - // - // Load values with all low-bits (except for alignment bits) set into - // SOffset, so that a larger range of values can be covered using - // s_movk_i32. - // - // Atomic operations fail to work correctly when individual address - // components are unaligned, even if their sum is aligned. - uint32_t High = (Imm + Align) & ~4095; - uint32_t Low = (Imm + Align) & 4095; - Imm = Low; - Overflow = High - Align; - } - } - - // There is a hardware bug in SI and CI which prevents address clamping in - // MUBUF instructions from working correctly with SOffsets. The immediate - // offset is unaffected. 
- if (Overflow > 0 && - Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) - return false; - - ImmOffset = CurDAG->getTargetConstant(Imm, DL, MVT::i16); - - if (Overflow <= 64) - SOffset = CurDAG->getTargetConstant(Overflow, DL, MVT::i32); - else - SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getTargetConstant(Overflow, DL, MVT::i32)), - 0); - - return true; -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset, - SDValue &SOffset, - SDValue &ImmOffset) const { - SDLoc DL(Offset); - - if (!isa<ConstantSDNode>(Offset)) - return false; - - return SelectMUBUFConstant(Offset, SOffset, ImmOffset); -} - -bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset, - SDValue &SOffset, - SDValue &ImmOffset, - SDValue &VOffset) const { - SDLoc DL(Offset); - - // Don't generate an unnecessary voffset for constant offsets. - if (isa<ConstantSDNode>(Offset)) { - SDValue Tmp1, Tmp2; - - // When necessary, use a voffset in <= CI anyway to work around a hardware - // bug. 
- if (Subtarget->getGeneration() > AMDGPUSubtarget::SEA_ISLANDS || - SelectMUBUFConstant(Offset, Tmp1, Tmp2)) - return false; - } - - if (CurDAG->isBaseWithConstantOffset(Offset)) { - SDValue N0 = Offset.getOperand(0); - SDValue N1 = Offset.getOperand(1); - if (cast<ConstantSDNode>(N1)->getSExtValue() >= 0 && - SelectMUBUFConstant(N1, SOffset, ImmOffset)) { - VOffset = N0; - return true; - } - } - - SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); - ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); - VOffset = Offset; - - return true; -} - template <bool IsSigned> bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr, SDValue &VAddr, @@ -1451,7 +1380,11 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool &Imm) const { SDLoc SL(Addr); - if (CurDAG->isBaseWithConstantOffset(Addr)) { + // A 32-bit (address + offset) should not cause unsigned 32-bit integer + // wraparound, because s_load instructions perform the addition in 64 bits. + if ((Addr.getValueType() != MVT::i32 || + Addr->getFlags().hasNoUnsignedWrap()) && + CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); @@ -1521,9 +1454,13 @@ bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, ConstantSDNode *C1 = cast<ConstantSDNode>(N1); // (add n0, c0) - Base = N0; - Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32); - return true; + // Don't peel off the offset (c0) if doing so could possibly lead + // the base (n0) to be negative. 
+ if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) { + Base = N0; + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32); + return true; + } } if (isa<ConstantSDNode>(Index)) @@ -1764,7 +1701,7 @@ void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) { void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { MemSDNode *Mem = cast<MemSDNode>(N); unsigned AS = Mem->getAddressSpace(); - if (AS == AMDGPUASI.FLAT_ADDRESS) { + if (AS == AMDGPUAS::FLAT_ADDRESS) { SelectCode(N); return; } @@ -1812,9 +1749,8 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { return; } - MachineSDNode::mmo_iterator MMOs = MF->allocateMemRefsArray(1); - *MMOs = Mem->getMemOperand(); - CmpSwap->setMemRefs(MMOs, MMOs + 1); + MachineMemOperand *MMO = Mem->getMemOperand(); + CurDAG->setNodeMemRefs(CmpSwap, {MMO}); unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1; SDValue Extract @@ -2113,6 +2049,80 @@ bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const { return isExtractHiElt(In, Src); } +bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { + if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { + return false; + } + const SIRegisterInfo *SIRI = + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + const SIInstrInfo * SII = + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + + unsigned Limit = 0; + bool AllUsesAcceptSReg = true; + for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); + Limit < 10 && U != E; ++U, ++Limit) { + const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); + + // If the register class is unknown, it could be an unknown + // register class that needs to be an SGPR, e.g. 
an inline asm + // constraint + if (!RC || SIRI->isSGPRClass(RC)) + return false; + + if (RC != &AMDGPU::VS_32RegClass) { + AllUsesAcceptSReg = false; + SDNode * User = *U; + if (User->isMachineOpcode()) { + unsigned Opc = User->getMachineOpcode(); + MCInstrDesc Desc = SII->get(Opc); + if (Desc.isCommutable()) { + unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo(); + unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex; + if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) { + unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs(); + const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo); + if (CommutedRC == &AMDGPU::VS_32RegClass) + AllUsesAcceptSReg = true; + } + } + } + // If "AllUsesAcceptSReg == false" so far we haven't suceeded + // commuting current user. This means have at least one use + // that strictly require VGPR. Thus, we will not attempt to commute + // other user instructions. + if (!AllUsesAcceptSReg) + break; + } + } + return !AllUsesAcceptSReg && (Limit < 10); +} + +bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const { + auto Ld = cast<LoadSDNode>(N); + + return Ld->getAlignment() >= 4 && + ( + ( + ( + Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || + Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT + ) + && + !N->isDivergent() + ) + || + ( + Subtarget->getScalarizeGlobalBehavior() && + Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + !Ld->isVolatile() && + !N->isDivergent() && + static_cast<const SITargetLowering *>( + getTargetLowering())->isMemOpHasNoClobberedMemOperand(N) + ) + ); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); @@ -2148,10 +2158,10 @@ bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const { if (!N->readMem()) return false; if (CbId == -1) - return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || - N->getAddressSpace() == 
AMDGPUASI.CONSTANT_ADDRESS_32BIT; + return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || + N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT; - return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId; + return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId; } bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 21e44e9589d3c..6951c915b1772 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -128,10 +128,8 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { } unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) { - KnownBits Known; EVT VT = Op.getValueType(); - DAG.computeKnownBits(Op, Known); - + KnownBits Known = DAG.computeKnownBits(Op); return VT.getSizeInBits() - Known.countMinLeadingZeros(); } @@ -146,7 +144,6 @@ unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) { AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { - AMDGPUASI = AMDGPU::getAMDGPUAS(TM); // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen. 
setOperationAction(ISD::LOAD, MVT::f32, Promote); @@ -318,6 +315,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FLOG, MVT::f32, Custom); setOperationAction(ISD::FLOG10, MVT::f32, Custom); + setOperationAction(ISD::FEXP, MVT::f32, Custom); setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); @@ -450,6 +448,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); @@ -470,6 +469,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); setOperationAction(ISD::SETCC, VT, Expand); + setOperationAction(ISD::FCANONICALIZE, VT, Expand); } // This causes using an unrolled select operation rather than expansion with @@ -550,6 +550,8 @@ static bool fnegFoldsIntoOp(unsigned Opc) { case ISD::FMAD: case ISD::FMINNUM: case ISD::FMAXNUM: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: case ISD::FSIN: case ISD::FTRUNC: case ISD::FRINT: @@ -562,6 +564,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) { case AMDGPUISD::FMUL_LEGACY: case AMDGPUISD::FMIN_LEGACY: case AMDGPUISD::FMAX_LEGACY: + case AMDGPUISD::FMED3: return true; default: return false; @@ -650,8 +653,11 @@ bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { } bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, - ISD::LoadExtType, + ISD::LoadExtType ExtTy, EVT NewVT) const { + // TODO: This may be worth removing. Check regression tests for diffs. 
+ if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT)) + return false; unsigned NewSize = NewVT.getStoreSizeInBits(); @@ -662,6 +668,18 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, EVT OldVT = N->getValueType(0); unsigned OldSize = OldVT.getStoreSizeInBits(); + MemSDNode *MN = cast<MemSDNode>(N); + unsigned AS = MN->getAddressSpace(); + // Do not shrink an aligned scalar load to sub-dword. + // Scalar engine cannot do sub-dword loads. + if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 && + (AS == AMDGPUAS::CONSTANT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || + (isa<LoadSDNode>(N) && + AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) && + AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand())) + return false; + // Don't produce extloads from sub 32-bit types. SI doesn't have scalar // extloads, so doing one requires using a buffer_load. In cases where we // still couldn't use a scalar load, using the wider load shouldn't really @@ -722,7 +740,7 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const { { const LoadSDNode * L = dyn_cast<LoadSDNode>(N); if (L->getMemOperand()->getAddrSpace() - == AMDGPUASI.CONSTANT_ADDRESS_32BIT) + == AMDGPUAS::CONSTANT_ADDRESS_32BIT) return true; return false; } @@ -1140,6 +1158,8 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F); case ISD::FLOG10: return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F); + case ISD::FEXP: + return lowerFEXP(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); @@ -1188,8 +1208,8 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = G->getGlobal(); - if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS || - G->getAddressSpace() == 
AMDGPUASI.REGION_ADDRESS) { + if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) { if (!MFI->isEntryFunction()) { const Function &Fn = DAG.getMachineFunction().getFunction(); DiagnosticInfoUnsupported BadLDSDecl( @@ -2213,6 +2233,34 @@ SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand); } +// Return M_LOG2E of appropriate type +static SDValue getLog2EVal(SelectionDAG &DAG, const SDLoc &SL, EVT VT) { + switch (VT.getScalarType().getSimpleVT().SimpleTy) { + case MVT::f32: + return DAG.getConstantFP(1.44269504088896340735992468100189214f, SL, VT); + case MVT::f16: + return DAG.getConstantFP( + APFloat(APFloat::IEEEhalf(), "1.44269504088896340735992468100189214"), + SL, VT); + case MVT::f64: + return DAG.getConstantFP( + APFloat(APFloat::IEEEdouble(), "0x1.71547652b82fep+0"), SL, VT); + default: + llvm_unreachable("unsupported fp type"); + } +} + +// exp2(M_LOG2E_F * f); +SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc SL(Op); + SDValue Src = Op.getOperand(0); + + const SDValue K = getLog2EVal(DAG, SL, VT); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags()); + return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags()); +} + static bool isCtlzOpc(unsigned Opc) { return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; } @@ -2669,21 +2717,33 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) { AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24; } -static bool simplifyI24(SDNode *Node24, unsigned OpIdx, - TargetLowering::DAGCombinerInfo &DCI) { - +static SDValue simplifyI24(SDNode *Node24, + TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - SDValue Op = Node24->getOperand(OpIdx); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT VT = Op.getValueType(); + SDValue LHS = Node24->getOperand(0); + SDValue 
RHS = Node24->getOperand(1); - APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); - APInt KnownZero, KnownOne; - TargetLowering::TargetLoweringOpt TLO(DAG, true, true); - if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO)) - return true; + APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24); - return false; + // First try to simplify using GetDemandedBits which allows the operands to + // have other uses, but will only perform simplifications that involve + // bypassing some nodes for this user. + SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded); + SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded); + if (DemandedLHS || DemandedRHS) + return DAG.getNode(Node24->getOpcode(), SDLoc(Node24), Node24->getVTList(), + DemandedLHS ? DemandedLHS : LHS, + DemandedRHS ? DemandedRHS : RHS); + + // Now try SimplifyDemandedBits which can simplify the nodes used by our + // operands if this node is the only user. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI)) + return SDValue(Node24, 0); + if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI)) + return SDValue(Node24, 0); + + return SDValue(); } template <typename IntTy> @@ -2920,8 +2980,7 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, // shl (ext x) => zext (shl x), if shift does not overflow int if (VT != MVT::i64) break; - KnownBits Known; - DAG.computeKnownBits(X, Known); + KnownBits Known = DAG.computeKnownBits(X); unsigned LZ = Known.countMinLeadingZeros(); if (LZ < RHSVal) break; @@ -3080,8 +3139,7 @@ SDValue AMDGPUTargetLowering::performTruncateCombine( Src.getOpcode() == ISD::SRA || Src.getOpcode() == ISD::SHL)) { SDValue Amt = Src.getOperand(1); - KnownBits Known; - DAG.computeKnownBits(Amt, Known); + KnownBits Known = DAG.computeKnownBits(Amt); unsigned Size = VT.getScalarSizeInBits(); if ((Known.isConstant() && Known.getConstant().ule(Size)) || (Known.getBitWidth() - 
Known.countMinLeadingZeros() <= Log2_32(Size))) { @@ -3233,8 +3291,8 @@ SDValue AMDGPUTargetLowering::performMulLoHi24Combine( SelectionDAG &DAG = DCI.DAG; // Simplify demanded bits before splitting into multiple users. - if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI)) - return SDValue(); + if (SDValue V = simplifyI24(N, DCI)) + return V; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -3449,9 +3507,27 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI); } -static bool isConstantFPZero(SDValue N) { - if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) - return C->isZero() && !C->isNegative(); +static bool isInv2Pi(const APFloat &APF) { + static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118)); + static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983)); + static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882)); + + return APF.bitwiseIsEqual(KF16) || + APF.bitwiseIsEqual(KF32) || + APF.bitwiseIsEqual(KF64); +} + +// 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an +// additional cost to negate them. 
+bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const { + if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) { + if (C->isZero() && !C->isNegative()) + return true; + + if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF())) + return true; + } + return false; } @@ -3461,6 +3537,10 @@ static unsigned inverseMinMax(unsigned Opc) { return ISD::FMINNUM; case ISD::FMINNUM: return ISD::FMAXNUM; + case ISD::FMAXNUM_IEEE: + return ISD::FMINNUM_IEEE; + case ISD::FMINNUM_IEEE: + return ISD::FMAXNUM_IEEE; case AMDGPUISD::FMAX_LEGACY: return AMDGPUISD::FMIN_LEGACY; case AMDGPUISD::FMIN_LEGACY: @@ -3566,6 +3646,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, } case ISD::FMAXNUM: case ISD::FMINNUM: + case ISD::FMAXNUM_IEEE: + case ISD::FMINNUM_IEEE: case AMDGPUISD::FMAX_LEGACY: case AMDGPUISD::FMIN_LEGACY: { // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y) @@ -3577,9 +3659,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, SDValue RHS = N0.getOperand(1); // 0 doesn't have a negated inline immediate. - // TODO: Shouldn't fold 1/2pi either, and should be generalized to other - // operations. - if (isConstantFPZero(RHS)) + // TODO: This constant check should be generalized to other operations. 
+ if (isConstantCostlierToNegate(RHS)) return SDValue(); SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); @@ -3591,6 +3672,16 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); return Res; } + case AMDGPUISD::FMED3: { + SDValue Ops[3]; + for (unsigned I = 0; I < 3; ++I) + Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags()); + + SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags()); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } case ISD::FP_EXTEND: case ISD::FTRUNC: case ISD::FRINT: @@ -3737,9 +3828,10 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, if (Src.getValueType() == MVT::i64) { SDLoc SL(N); uint64_t CVal = C->getZExtValue(); - return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT, - DAG.getConstant(Lo_32(CVal), SL, MVT::i32), - DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, + DAG.getConstant(Lo_32(CVal), SL, MVT::i32), + DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); + return DAG.getNode(ISD::BITCAST, SL, DestVT, BV); } } @@ -3786,9 +3878,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::MUL_U24: case AMDGPUISD::MULHI_I24: case AMDGPUISD::MULHI_U24: { - // If the first call to simplify is successfull, then N may end up being - // deleted, so we shouldn't call simplifyI24 again. 
- simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI); + if (SDValue V = simplifyI24(N, DCI)) + return V; return SDValue(); } case AMDGPUISD::MUL_LOHI_I24: @@ -3943,13 +4034,12 @@ SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG, SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, - SDValue StackPtr, SDValue ArgVal, int64_t Offset) const { MachineFunction &MF = DAG.getMachineFunction(); MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset); - SDValue Ptr = DAG.getObjectPtrOffset(SL, StackPtr, Offset); + SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32); SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4, MachineMemOperand::MODereferenceable); return Store; @@ -4111,6 +4201,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_LOAD) NODE_NAME_CASE(BUFFER_LOAD_FORMAT) NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16) + NODE_NAME_CASE(SBUFFER_LOAD) NODE_NAME_CASE(BUFFER_STORE) NODE_NAME_CASE(BUFFER_STORE_FORMAT) NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16) @@ -4210,33 +4301,42 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( } case AMDGPUISD::MUL_U24: case AMDGPUISD::MUL_I24: { - KnownBits LHSKnown, RHSKnown; - DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1); - DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1); - + KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); + KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); unsigned TrailZ = LHSKnown.countMinTrailingZeros() + RHSKnown.countMinTrailingZeros(); Known.Zero.setLowBits(std::min(TrailZ, 32u)); - unsigned LHSValBits = 32 - std::max(LHSKnown.countMinSignBits(), 8u); - unsigned RHSValBits = 32 - std::max(RHSKnown.countMinSignBits(), 8u); - unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u); - if (MaxValBits >= 32) - break; + // Truncate to 24 bits. 
+ LHSKnown = LHSKnown.trunc(24); + RHSKnown = RHSKnown.trunc(24); + bool Negative = false; if (Opc == AMDGPUISD::MUL_I24) { - bool LHSNegative = !!(LHSKnown.One & (1 << 23)); - bool LHSPositive = !!(LHSKnown.Zero & (1 << 23)); - bool RHSNegative = !!(RHSKnown.One & (1 << 23)); - bool RHSPositive = !!(RHSKnown.Zero & (1 << 23)); + unsigned LHSValBits = 24 - LHSKnown.countMinSignBits(); + unsigned RHSValBits = 24 - RHSKnown.countMinSignBits(); + unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u); + if (MaxValBits >= 32) + break; + bool LHSNegative = LHSKnown.isNegative(); + bool LHSPositive = LHSKnown.isNonNegative(); + bool RHSNegative = RHSKnown.isNegative(); + bool RHSPositive = RHSKnown.isNonNegative(); if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive)) break; Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative); - } - if (Negative) - Known.One.setHighBits(32 - MaxValBits); - else + if (Negative) + Known.One.setHighBits(32 - MaxValBits); + else + Known.Zero.setHighBits(32 - MaxValBits); + } else { + unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros(); + unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros(); + unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u); + if (MaxValBits >= 32) + break; Known.Zero.setHighBits(32 - MaxValBits); + } break; } case AMDGPUISD::PERM: { @@ -4244,9 +4344,8 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( if (!CMask) return; - KnownBits LHSKnown, RHSKnown; - DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1); - DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1); + KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); + KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); unsigned Sel = CMask->getZExtValue(); for (unsigned I = 0; I < 32; I += 8) { @@ -4320,3 +4419,107 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( return 1; } } + +bool 
AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, + const SelectionDAG &DAG, + bool SNaN, + unsigned Depth) const { + unsigned Opcode = Op.getOpcode(); + switch (Opcode) { + case AMDGPUISD::FMIN_LEGACY: + case AMDGPUISD::FMAX_LEGACY: { + if (SNaN) + return true; + + // TODO: Can check no nans on one of the operands for each one, but which + // one? + return false; + } + case AMDGPUISD::FMUL_LEGACY: + case AMDGPUISD::CVT_PKRTZ_F16_F32: { + if (SNaN) + return true; + return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && + DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); + } + case AMDGPUISD::FMED3: + case AMDGPUISD::FMIN3: + case AMDGPUISD::FMAX3: + case AMDGPUISD::FMAD_FTZ: { + if (SNaN) + return true; + return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) && + DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && + DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1); + } + case AMDGPUISD::CVT_F32_UBYTE0: + case AMDGPUISD::CVT_F32_UBYTE1: + case AMDGPUISD::CVT_F32_UBYTE2: + case AMDGPUISD::CVT_F32_UBYTE3: + return true; + + case AMDGPUISD::RCP: + case AMDGPUISD::RSQ: + case AMDGPUISD::RCP_LEGACY: + case AMDGPUISD::RSQ_LEGACY: + case AMDGPUISD::RSQ_CLAMP: { + if (SNaN) + return true; + + // TODO: Need is known positive check. + return false; + } + case AMDGPUISD::LDEXP: + case AMDGPUISD::FRACT: { + if (SNaN) + return true; + return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); + } + case AMDGPUISD::DIV_SCALE: + case AMDGPUISD::DIV_FMAS: + case AMDGPUISD::DIV_FIXUP: + case AMDGPUISD::TRIG_PREOP: + // TODO: Refine on operands. 
+ return SNaN; + case AMDGPUISD::SIN_HW: + case AMDGPUISD::COS_HW: { + // TODO: Need check for infinity + return SNaN; + } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntrinsicID + = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + // TODO: Handle more intrinsics + switch (IntrinsicID) { + case Intrinsic::amdgcn_cubeid: + return true; + + case Intrinsic::amdgcn_frexp_mant: { + if (SNaN) + return true; + return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); + } + case Intrinsic::amdgcn_cvt_pkrtz: { + if (SNaN) + return true; + return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) && + DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1); + } + case Intrinsic::amdgcn_fdot2: + // TODO: Refine on operand + return SNaN; + default: + return false; + } + } + default: + return false; + } +} + +TargetLowering::AtomicExpansionKind +AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { + if (RMW->getOperation() == AtomicRMWInst::Nand) + return AtomicExpansionKind::CmpXChg; + return AtomicExpansionKind::None; +} diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index a4c3b413e1037..0d22cb2e3e20b 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -41,8 +41,6 @@ public: static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG); protected: - AMDGPUAS AMDGPUASI; - SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; /// Split a vector store into multiple scalar stores. 
@@ -58,8 +56,9 @@ protected: SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFLOG(SDValue Op, SelectionDAG &Dag, + SDValue LowerFLOG(SDValue Op, SelectionDAG &DAG, double Log2BaseInverted) const; + SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const; @@ -95,6 +94,8 @@ protected: SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const; SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + bool isConstantCostlierToNegate(SDValue N) const; SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -246,6 +247,11 @@ public: const SelectionDAG &DAG, unsigned Depth = 0) const override; + bool isKnownNeverNaNForTargetNode(SDValue Op, + const SelectionDAG &DAG, + bool SNaN = false, + unsigned Depth = 0) const override; + /// Helper function that adds Reg to the LiveIn list of the DAG's /// MachineFunction. 
/// @@ -279,7 +285,6 @@ public: SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, - SDValue StackPtr, SDValue ArgVal, int64_t Offset) const; @@ -299,13 +304,11 @@ public: uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const; - AMDGPUAS getAMDGPUAS() const { - return AMDGPUASI; - } - MVT getFenceOperandTy(const DataLayout &DL) const override { return MVT::i32; } + + AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; }; namespace AMDGPUISD { @@ -357,6 +360,7 @@ enum NodeType : unsigned { SIN_HW, FMAX_LEGACY, FMIN_LEGACY, + FMAX3, SMAX3, UMAX3, @@ -479,6 +483,7 @@ enum NodeType : unsigned { BUFFER_LOAD, BUFFER_LOAD_FORMAT, BUFFER_LOAD_FORMAT_D16, + SBUFFER_LOAD, BUFFER_STORE, BUFFER_STORE_FORMAT, BUFFER_STORE_FORMAT_D16, diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp index 35dd9eb0a478d..945c9acd379a5 100644 --- a/lib/Target/AMDGPU/AMDGPUInline.cpp +++ b/lib/Target/AMDGPU/AMDGPUInline.cpp @@ -44,7 +44,7 @@ ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200), cl::desc("Cost of alloca argument")); // If the amount of scratch memory to eliminate exceeds our ability to allocate -// it into registers we gain nothing by agressively inlining functions for that +// it into registers we gain nothing by aggressively inlining functions for that // heuristic. static cl::opt<unsigned> ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), @@ -118,8 +118,6 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const { if (!Callee) return (unsigned)Thres; - const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*Caller->getParent()); - // If we have a pointer to private array passed into a function // it will not be optimized out, leaving scratch usage. // Increase the inline threshold to allow inliniting in this case. 
@@ -128,7 +126,7 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const { for (Value *PtrArg : CS.args()) { Type *Ty = PtrArg->getType(); if (!Ty->isPointerTy() || - Ty->getPointerAddressSpace() != AS.PRIVATE_ADDRESS) + Ty->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) continue; PtrArg = GetUnderlyingObject(PtrArg, DL); if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) { @@ -174,18 +172,23 @@ InlineCost AMDGPUInliner::getInlineCost(CallSite CS) { Function *Caller = CS.getCaller(); TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); - if (!Callee || Callee->isDeclaration() || CS.isNoInline() || - !TTI.areInlineCompatible(Caller, Callee)) - return llvm::InlineCost::getNever(); + if (!Callee || Callee->isDeclaration()) + return llvm::InlineCost::getNever("undefined callee"); + + if (CS.isNoInline()) + return llvm::InlineCost::getNever("noinline"); + + if (!TTI.areInlineCompatible(Caller, Callee)) + return llvm::InlineCost::getNever("incompatible"); if (CS.hasFnAttr(Attribute::AlwaysInline)) { if (isInlineViable(*Callee)) - return llvm::InlineCost::getAlways(); - return llvm::InlineCost::getNever(); + return llvm::InlineCost::getAlways("alwaysinline viable"); + return llvm::InlineCost::getNever("alwaysinline unviable"); } if (isWrapperOnlyCall(CS)) - return llvm::InlineCost::getAlways(); + return llvm::InlineCost::getAlways("wrapper-only call"); InlineParams LocalParams = Params; LocalParams.DefaultThreshold = (int)getInlineThreshold(CS); diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 7442a59e594f1..82644be265638 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -62,18 +62,10 @@ def AMDGPULoopOp : SDTypeProfile<0, 2, [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>] >; -def AMDGPUBreakOp : SDTypeProfile<1, 1, - [SDTCisVT<0, i64>, SDTCisVT<1, i64>] ->; - def AMDGPUIfBreakOp : SDTypeProfile<1, 2, [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>] >; -def 
AMDGPUElseBreakOp : SDTypeProfile<1, 2, - [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>] ->; - def AMDGPUAddeSubeOp : SDTypeProfile<2, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>] >; diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 219d430fbb395..8eb49d49b2e08 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -55,7 +55,6 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector( #define GET_GLOBALISEL_TEMPORARIES_INIT #include "AMDGPUGenGlobalISel.inc" #undef GET_GLOBALISEL_TEMPORARIES_INIT - ,AMDGPUASI(STI.getAMDGPUAS()) { } @@ -506,8 +505,8 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I, if (!I.hasOneMemOperand()) return false; - if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS && - (*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT) + if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS && + (*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS_32BIT) return false; if (!isInstrUniform(I)) @@ -631,6 +630,7 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I, return selectImpl(I, CoverageInfo); case TargetOpcode::G_ADD: return selectG_ADD(I); + case TargetOpcode::G_INTTOPTR: case TargetOpcode::G_BITCAST: return selectCOPY(I); case TargetOpcode::G_CONSTANT: diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 68b40b20aca24..449431adc561a 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -105,9 +105,6 @@ private: #define GET_GLOBALISEL_TEMPORARIES_DECL #include "AMDGPUGenGlobalISel.inc" #undef GET_GLOBALISEL_TEMPORARIES_DECL - -protected: - AMDGPUAS AMDGPUASI; }; } // End llvm namespace. 
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index c9c932ef2f5fb..eb8f2002ff2dc 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -135,6 +135,12 @@ def brtarget : Operand<OtherVT>; // Misc. PatFrags //===----------------------------------------------------------------------===// +class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag< + (ops node:$src0), + (op $src0), + [{ return N->hasOneUse(); }] +>; + class HasOneUseBinOp<SDPatternOperator op> : PatFrag< (ops node:$src0, node:$src1), (op $src0, $src1), @@ -152,13 +158,21 @@ def smax_oneuse : HasOneUseBinOp<smax>; def smin_oneuse : HasOneUseBinOp<smin>; def umax_oneuse : HasOneUseBinOp<umax>; def umin_oneuse : HasOneUseBinOp<umin>; + def fminnum_oneuse : HasOneUseBinOp<fminnum>; def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>; + +def fminnum_ieee_oneuse : HasOneUseBinOp<fminnum_ieee>; +def fmaxnum_ieee_oneuse : HasOneUseBinOp<fmaxnum_ieee>; + + def and_oneuse : HasOneUseBinOp<and>; def or_oneuse : HasOneUseBinOp<or>; def xor_oneuse : HasOneUseBinOp<xor>; } // Properties = [SDNPCommutative, SDNPAssociative] +def not_oneuse : HasOneUseUnaryOp<not>; + def add_oneuse : HasOneUseBinOp<add>; def sub_oneuse : HasOneUseBinOp<sub>; @@ -167,6 +181,9 @@ def shl_oneuse : HasOneUseBinOp<shl>; def select_oneuse : HasOneUseTernaryOp<select>; +def AMDGPUmul_u24_oneuse : HasOneUseBinOp<AMDGPUmul_u24>; +def AMDGPUmul_i24_oneuse : HasOneUseBinOp<AMDGPUmul_i24>; + def srl_16 : PatFrag< (ops node:$src0), (srl_oneuse node:$src0, (i32 16)) >; @@ -328,37 +345,37 @@ class StoreHi16<SDPatternOperator op> : PatFrag < >; class PrivateAddress : CodePatPred<[{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; }]>; class ConstantAddress : CodePatPred<[{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS; + return 
cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; }]>; class LocalAddress : CodePatPred<[{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; class GlobalAddress : CodePatPred<[{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; class GlobalLoadAddress : CodePatPred<[{ auto AS = cast<MemSDNode>(N)->getAddressSpace(); - return AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.CONSTANT_ADDRESS; + return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS; }]>; class FlatLoadAddress : CodePatPred<[{ const auto AS = cast<MemSDNode>(N)->getAddressSpace(); - return AS == AMDGPUASI.FLAT_ADDRESS || - AS == AMDGPUASI.GLOBAL_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS; + return AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; }]>; class FlatStoreAddress : CodePatPred<[{ const auto AS = cast<MemSDNode>(N)->getAddressSpace(); - return AS == AMDGPUASI.FLAT_ADDRESS || - AS == AMDGPUASI.GLOBAL_ADDRESS; + return AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::GLOBAL_ADDRESS; }]>; class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr), @@ -480,7 +497,7 @@ def az_extloadi16_constant : ConstantLoad <az_extloadi16>; class local_binary_atomic_op<SDNode atomic_op> : PatFrag<(ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; def atomic_swap_local : local_binary_atomic_op<atomic_swap>; @@ -497,14 +514,14 @@ def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>; def mskor_global : PatFrag<(ops node:$val, node:$ptr), (AMDGPUstore_mskor node:$val, node:$ptr), [{ - return 
cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS; + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; class AtomicCmpSwapLocal <SDNode cmp_swap_node> : PatFrag< (ops node:$ptr, node:$cmp, node:$swap), (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ AtomicSDNode *AN = cast<AtomicSDNode>(N); - return AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; + return AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; def atomic_cmp_swap_local : AtomicCmpSwapLocal <atomic_cmp_swap>; @@ -513,17 +530,17 @@ multiclass global_binary_atomic_op<SDNode atomic_op> { def "" : PatFrag< (ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>; def _noret : PatFrag< (ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; def _ret : PatFrag< (ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; } defm atomic_swap_global : global_binary_atomic_op<atomic_swap>; @@ -550,12 +567,12 @@ def atomic_cmp_swap_global : PatFrag< def atomic_cmp_swap_global_noret : PatFrag< (ops node:$ptr, node:$cmp, node:$value), (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; def 
atomic_cmp_swap_global_ret : PatFrag< (ops node:$ptr, node:$cmp, node:$value), (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; //===----------------------------------------------------------------------===// // Misc Pattern Fragments @@ -787,18 +804,30 @@ class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat < (BIT_ALIGN $src0, $src0, $src1) >; -// This matches 16 permutations of -// max(min(x, y), min(max(x, y), z)) -class IntMed3Pat<Instruction med3Inst, +multiclass IntMed3Pat<Instruction med3Inst, + SDPatternOperator min, SDPatternOperator max, - SDPatternOperator max_oneuse, SDPatternOperator min_oneuse, - ValueType vt = i32> : AMDGPUPat< + SDPatternOperator max_oneuse, + ValueType vt = i32> { + + // This matches 16 permutations of + // min(max(a, b), max(min(a, b), c)) + def : AMDGPUPat < + (min (max_oneuse vt:$src0, vt:$src1), + (max_oneuse (min_oneuse vt:$src0, vt:$src1), vt:$src2)), + (med3Inst vt:$src0, vt:$src1, vt:$src2) +>; + + // This matches 16 permutations of + // max(min(x, y), min(max(x, y), z)) + def : AMDGPUPat < (max (min_oneuse vt:$src0, vt:$src1), (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)), (med3Inst $src0, $src1, $src2) >; - +} + // Special conversion patterns def cvt_rpi_i32_f32 : PatFrag < @@ -813,6 +842,7 @@ def cvt_flr_i32_f32 : PatFrag < [{ (void)N; return TM.Options.NoNaNsFPMath; }] >; +let AddedComplexity = 2 in { class IMad24Pat<Instruction Inst, bit HasClamp = 0> : AMDGPUPat < (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2), !if(HasClamp, (Inst $src0, $src1, $src2, (i1 0)), @@ -824,6 +854,7 @@ class UMad24Pat<Instruction Inst, bit HasClamp = 0> : AMDGPUPat < !if(HasClamp, (Inst $src0, $src1, $src2, (i1 0)), (Inst $src0, $src1, $src2)) >; +} // AddedComplexity. 
class RcpPat<Instruction RcpInst, ValueType vt> : AMDGPUPat < (fdiv FP_ONE, vt:$src), @@ -834,3 +865,25 @@ class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat < (AMDGPUrcp (fsqrt vt:$src)), (RsqInst $src) >; + +// Instructions which select to the same v_min_f* +def fminnum_like : PatFrags<(ops node:$src0, node:$src1), + [(fminnum_ieee node:$src0, node:$src1), + (fminnum node:$src0, node:$src1)] +>; + +// Instructions which select to the same v_max_f* +def fmaxnum_like : PatFrags<(ops node:$src0, node:$src1), + [(fmaxnum_ieee node:$src0, node:$src1), + (fmaxnum node:$src0, node:$src1)] +>; + +def fminnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1), + [(fminnum_ieee_oneuse node:$src0, node:$src1), + (fminnum_oneuse node:$src0, node:$src1)] +>; + +def fmaxnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1), + [(fmaxnum_ieee_oneuse node:$src0, node:$src1), + (fmaxnum_oneuse node:$src0, node:$src1)] +>; diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp index 896e2055cf620..02108ca3ddd78 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -40,7 +40,7 @@ StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID, if (IntrID < Intrinsic::num_intrinsics) return StringRef(); - assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && + assert(IntrID < SIIntrinsic::num_AMDGPU_intrinsics && "Invalid intrinsic ID"); return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]; @@ -91,7 +91,7 @@ Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy)); AttributeList AS = - getAttributes(M->getContext(), static_cast<AMDGPUIntrinsic::ID>(IntrID)); + getAttributes(M->getContext(), static_cast<SIIntrinsic::ID>(IntrID)); F->setAttributes(AS); return F; } diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h index ef42f9a319af6..a1a094dded23d 
100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h @@ -20,7 +20,7 @@ namespace llvm { class TargetMachine; -namespace AMDGPUIntrinsic { +namespace SIIntrinsic { enum ID { last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1, #define GET_INTRINSIC_ENUM_VALUES diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td deleted file mode 100644 index 230a046285047..0000000000000 --- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ /dev/null @@ -1,16 +0,0 @@ -//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines intrinsics that are used by all hw codegen targets. -// -//===----------------------------------------------------------------------===// - -let TargetPrefix = "AMDGPU", isTarget = 1 in { - def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; -} diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 87b072c9ea20a..ef85c1040545f 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -32,20 +32,52 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); }; - auto AMDGPUAS = ST.getAMDGPUAS(); - const LLT S1 = LLT::scalar(1); - const LLT V2S16 = LLT::vector(2, 16); - const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); const LLT S512 = LLT::scalar(512); + const LLT V2S16 = LLT::vector(2, 16); + const LLT V4S16 = LLT::vector(4, 16); + const LLT V8S16 = LLT::vector(8, 16); + + const LLT V2S32 = LLT::vector(2, 32); + const LLT V3S32 = LLT::vector(3, 32); + const LLT V4S32 = LLT::vector(4, 32); + const LLT V5S32 = 
LLT::vector(5, 32); + const LLT V6S32 = LLT::vector(6, 32); + const LLT V7S32 = LLT::vector(7, 32); + const LLT V8S32 = LLT::vector(8, 32); + const LLT V9S32 = LLT::vector(9, 32); + const LLT V10S32 = LLT::vector(10, 32); + const LLT V11S32 = LLT::vector(11, 32); + const LLT V12S32 = LLT::vector(12, 32); + const LLT V13S32 = LLT::vector(13, 32); + const LLT V14S32 = LLT::vector(14, 32); + const LLT V15S32 = LLT::vector(15, 32); + const LLT V16S32 = LLT::vector(16, 32); + + const LLT V2S64 = LLT::vector(2, 64); + const LLT V3S64 = LLT::vector(3, 64); + const LLT V4S64 = LLT::vector(4, 64); + const LLT V5S64 = LLT::vector(5, 64); + const LLT V6S64 = LLT::vector(6, 64); + const LLT V7S64 = LLT::vector(7, 64); + const LLT V8S64 = LLT::vector(8, 64); + + std::initializer_list<LLT> AllS32Vectors = + {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, + V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32}; + std::initializer_list<LLT> AllS64Vectors = + {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64}; + const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); - const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS.FLAT_ADDRESS); - const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS.PRIVATE_ADDRESS); + const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); + const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); + + const LLT CodePtr = FlatPtr; const LLT AddrSpaces[] = { GlobalPtr, @@ -55,13 +87,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, PrivatePtr }; + setAction({G_BRCOND, S1}, Legal); + setAction({G_ADD, S32}, Legal); setAction({G_ASHR, S32}, Legal); setAction({G_SUB, S32}, Legal); setAction({G_MUL, S32}, Legal); - setAction({G_AND, S32}, Legal); - setAction({G_OR, S32}, Legal); - setAction({G_XOR, S32}, Legal); + + // FIXME: 64-bit ones only legal for scalar + 
getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) + .legalFor({S32, S1, S64, V2S32}); + + getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO, + G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) + .legalFor({{S32, S1}}); setAction({G_BITCAST, V2S16}, Legal); setAction({G_BITCAST, 1, S32}, Legal); @@ -90,35 +129,80 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, // between these two scenarios. setAction({G_CONSTANT, S1}, Legal); - setAction({G_FADD, S32}, Legal); + setAction({G_FRAME_INDEX, PrivatePtr}, Legal); + + getActionDefinitionsBuilder( + { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA}) + .legalFor({S32, S64}); + + getActionDefinitionsBuilder(G_FPTRUNC) + .legalFor({{S32, S64}}); + + // Use actual fsub instruction + setAction({G_FSUB, S32}, Legal); + + // Must use fadd + fneg + setAction({G_FSUB, S64}, Lower); setAction({G_FCMP, S1}, Legal); setAction({G_FCMP, 1, S32}, Legal); setAction({G_FCMP, 1, S64}, Legal); - setAction({G_FMUL, S32}, Legal); - setAction({G_ZEXT, S64}, Legal); setAction({G_ZEXT, 1, S32}, Legal); + setAction({G_SEXT, S64}, Legal); + setAction({G_SEXT, 1, S32}, Legal); + + setAction({G_ANYEXT, S64}, Legal); + setAction({G_ANYEXT, 1, S32}, Legal); + setAction({G_FPTOSI, S32}, Legal); setAction({G_FPTOSI, 1, S32}, Legal); setAction({G_SITOFP, S32}, Legal); setAction({G_SITOFP, 1, S32}, Legal); + setAction({G_UITOFP, S32}, Legal); + setAction({G_UITOFP, 1, S32}, Legal); + setAction({G_FPTOUI, S32}, Legal); setAction({G_FPTOUI, 1, S32}, Legal); + setAction({G_FPOW, S32}, Legal); + setAction({G_FEXP2, S32}, Legal); + setAction({G_FLOG2, S32}, Legal); + + getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND}) + .legalFor({S32, S64}); + for (LLT PtrTy : AddrSpaces) { LLT IdxTy = LLT::scalar(PtrTy.getSizeInBits()); setAction({G_GEP, PtrTy}, Legal); setAction({G_GEP, 1, IdxTy}, Legal); } + setAction({G_BLOCK_ADDR, CodePtr}, Legal); + setAction({G_ICMP, S1}, Legal); setAction({G_ICMP, 1, S32}, Legal); + setAction({G_CTLZ, 
S32}, Legal); + setAction({G_CTLZ_ZERO_UNDEF, S32}, Legal); + setAction({G_CTTZ, S32}, Legal); + setAction({G_CTTZ_ZERO_UNDEF, S32}, Legal); + setAction({G_BSWAP, S32}, Legal); + setAction({G_CTPOP, S32}, Legal); + + getActionDefinitionsBuilder(G_INTTOPTR) + .legalIf([](const LegalityQuery &Query) { + return true; + }); + + getActionDefinitionsBuilder(G_PTRTOINT) + .legalIf([](const LegalityQuery &Query) { + return true; + }); getActionDefinitionsBuilder({G_LOAD, G_STORE}) .legalIf([=, &ST](const LegalityQuery &Query) { @@ -145,6 +229,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, }); + auto &Atomics = getActionDefinitionsBuilder( + {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, + G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, + G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, + G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG}) + .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, + {S64, GlobalPtr}, {S64, LocalPtr}}); + if (ST.hasFlatAddressSpace()) { + Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); + } setAction({G_SELECT, S32}, Legal); setAction({G_SELECT, 1, S1}, Legal); @@ -180,6 +274,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST, (Ty1.getSizeInBits() % 32 == 0); }); + getActionDefinitionsBuilder(G_BUILD_VECTOR) + .legalForCartesianProduct(AllS32Vectors, {S32}) + .legalForCartesianProduct(AllS64Vectors, {S64}) + .clampNumElements(0, V16S32, V16S32) + .clampNumElements(0, V2S64, V8S64) + .minScalarSameAs(1, 0); + + // TODO: Support any combination of v2s32 + getActionDefinitionsBuilder(G_CONCAT_VECTORS) + .legalFor({{V4S32, V2S32}, + {V8S32, V2S32}, + {V8S32, V4S32}, + {V4S64, V2S64}, + {V4S16, V2S16}, + {V8S16, V2S16}, + {V8S16, V4S16}}); + // Merge/Unmerge for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { unsigned BigTyIdx = Op == G_MERGE_VALUES ? 
0 : 1; diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 7a7ed7a4f0656..14e8800426911 100644 --- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -1333,8 +1333,7 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B, // for OpenCL 2.0 we have only generic implementation of sincos // function. AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo); - const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*M); - nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AS.FLAT_ADDRESS); + nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS); Function *Fsincos = dyn_cast_or_null<Function>(getFunction(M, nf)); if (!Fsincos) return false; @@ -1347,7 +1346,7 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B, // The allocaInst allocates the memory in private address space. This need // to be bitcasted to point to the address space of cos pointer type. // In OpenCL 2.0 this is generic, while in 1.2 that is private. - if (PTy->getPointerAddressSpace() != AS.PRIVATE_ADDRESS) + if (PTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) P = B.CreateAddrSpaceCast(Alloc, PTy); CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P); diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp index 4671273d61f91..4fc3fe0f105b0 100644 --- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp +++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp @@ -90,7 +90,6 @@ class UnmangledFuncInfo { public: using ID = AMDGPULibFunc::EFuncId; - UnmangledFuncInfo() = default; UnmangledFuncInfo(StringRef _Name, unsigned _NumArgs) : Name(_Name), NumArgs(_NumArgs) {} // Get index to Table by function name. 
@@ -996,8 +995,10 @@ Function *AMDGPULibFunc::getOrInsertFunction(Module *M, } else { AttributeList Attr; LLVMContext &Ctx = M->getContext(); - Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::ReadOnly); - Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::NoUnwind); + Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex, + Attribute::ReadOnly); + Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex, + Attribute::NoUnwind); C = M->getOrInsertFunction(FuncName, FuncTy, Attr); } diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index c147830e12ed6..743dc7a0d00b9 100644 --- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -16,7 +16,6 @@ #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -84,8 +83,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { return false; CallInst *KernArgSegment = - Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, nullptr, - F.getName() + ".kernarg.segment"); + Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {}, + nullptr, F.getName() + ".kernarg.segment"); KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); KernArgSegment->addAttribute(AttributeList::ReturnIndex, @@ -123,14 +122,17 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { VectorType *VT = dyn_cast<VectorType>(ArgTy); bool IsV3 = VT && VT->getNumElements() == 3; + bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType(); + VectorType *V4Ty = nullptr; int64_t AlignDownOffset = alignDown(EltOffset, 4); int64_t OffsetDiff = EltOffset - AlignDownOffset; - unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset); + unsigned 
AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset, + KernArgBaseAlign); Value *ArgPtr; - if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types + if (DoShiftOpt) { // FIXME: Handle aggregate types // Since we don't have sub-dword scalar loads, avoid doing an extload by // loading earlier than the argument address, and extracting the relevant // bits. @@ -148,7 +150,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { } else { ArgPtr = Builder.CreateConstInBoundsGEP1_64( KernArgSegment, - AlignDownOffset, + EltOffset, Arg.getName() + ".kernarg.offset"); ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS), ArgPtr->getName() + ".cast"); @@ -199,7 +201,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { // TODO: Convert noalias arg to !noalias - if (Size < 32 && !ArgTy->isAggregateType()) { + if (DoShiftOpt) { Value *ExtractBits = OffsetDiff == 0 ? Load : Builder.CreateLShr(Load, OffsetDiff * 8); diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 1876dc3f71221..f6bdbf5e9be2c 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -301,6 +301,26 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInstLowering.lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); +#ifdef EXPENSIVE_CHECKS + // Sanity-check getInstSizeInBytes on explicitly specified CPUs (it cannot + // work correctly for the generic CPU). + // + // The isPseudo check really shouldn't be here, but unfortunately there are + // some negative lit tests that depend on being able to continue through + // here even when pseudo instructions haven't been lowered. 
+ if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU())) { + SmallVector<MCFixup, 4> Fixups; + SmallVector<char, 16> CodeBytes; + raw_svector_ostream CodeStream(CodeBytes); + + std::unique_ptr<MCCodeEmitter> InstEmitter(createSIMCCodeEmitter( + *STI.getInstrInfo(), *OutContext.getRegisterInfo(), OutContext)); + InstEmitter->encodeInstruction(TmpInst, CodeStream, Fixups, STI); + + assert(CodeBytes.size() == STI.getInstrInfo()->getInstSizeInBytes(*MI)); + } +#endif + if (STI.dumpCode()) { // Disassemble instruction/operands to text. DisasmLines.resize(DisasmLines.size() + 1); diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp index 995d9ae3907fc..5e0b7d4290220 100644 --- a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp +++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp @@ -42,9 +42,12 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_, if (!FirstMI) return true; + const MachineBasicBlock &MBB = *FirstMI->getParent(); + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); const MachineOperand *Src2 = TII.getNamedOperand(SecondMI, AMDGPU::OpName::src2); - return FirstMI->definesRegister(Src2->getReg()); + return FirstMI->definesRegister(Src2->getReg(), TRI); } default: return false; diff --git a/lib/Target/AMDGPU/AMDGPUPTNote.h b/lib/Target/AMDGPU/AMDGPUPTNote.h index b50a2eb8e9e71..2feff14d34a15 100644 --- a/lib/Target/AMDGPU/AMDGPUPTNote.h +++ b/lib/Target/AMDGPU/AMDGPUPTNote.h @@ -23,7 +23,8 @@ namespace ElfNote { const char SectionName[] = ".note"; -const char NoteName[] = "AMD"; +const char NoteNameV2[] = "AMD"; +const char NoteNameV3[] = "AMDGPU"; // TODO: Remove this file once we drop code object v2. 
enum NoteType{ diff --git a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 3cfdccc9fe51a..e53a8fe7c074d 100644 --- a/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -99,8 +99,6 @@ private: const DataLayout *DL; - AMDGPUAS AS; - const TargetLowering *TLI; void visit(const Function &F); @@ -267,7 +265,6 @@ void AMDGPUPerfHint::runOnFunction(Function &F) { const Module &M = *F.getParent(); DL = &M.getDataLayout(); - AS = AMDGPU::getAMDGPUAS(M); visit(F); auto Loc = FIM.find(&F); @@ -306,14 +303,14 @@ bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const { if (auto PT = dyn_cast<PointerType>(V->getType())) { unsigned As = PT->getAddressSpace(); // Flat likely points to global too. - return As == AS.GLOBAL_ADDRESS || As == AS.FLAT_ADDRESS; + return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS; } return false; } bool AMDGPUPerfHint::isLocalAddr(const Value *V) const { if (auto PT = dyn_cast<PointerType>(V->getType())) - return PT->getAddressSpace() == AS.LOCAL_ADDRESS; + return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; return false; } @@ -346,7 +343,8 @@ AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const { bool AMDGPUPerfHint::isConstantAddr(const Value *V) const { if (auto PT = dyn_cast<PointerType>(V->getType())) { unsigned As = PT->getAddressSpace(); - return As == AS.CONSTANT_ADDRESS || As == AS.CONSTANT_ADDRESS_32BIT; + return As == AMDGPUAS::CONSTANT_ADDRESS || + As == AMDGPUAS::CONSTANT_ADDRESS_32BIT; } return false; } diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index d341fec6296fb..5d087c0991844 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -70,13 +70,17 @@ static cl::opt<bool> DisablePromoteAllocaToVector( cl::desc("Disable promote alloca to vector"), cl::init(false)); +static cl::opt<bool> 
DisablePromoteAllocaToLDS( + "disable-promote-alloca-to-lds", + cl::desc("Disable promote alloca to LDS"), + cl::init(false)); + // FIXME: This can create globals so should be a module pass. class AMDGPUPromoteAlloca : public FunctionPass { private: const TargetMachine *TM; Module *Mod = nullptr; const DataLayout *DL = nullptr; - AMDGPUAS AS; // FIXME: This should be per-kernel. uint32_t LocalMemLimit = 0; @@ -156,8 +160,6 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { if (!ST.isPromoteAllocaEnabled()) return false; - AS = AMDGPU::getAMDGPUAS(*F.getParent()); - bool SufficientLDS = hasSufficientLocalMem(F); bool Changed = false; BasicBlock &EntryBB = *F.begin(); @@ -238,7 +240,7 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { Type *I32Ty = Type::getInt32Ty(Mod->getContext()); Value *CastDispatchPtr = Builder.CreateBitCast( - DispatchPtr, PointerType::get(I32Ty, AS.CONSTANT_ADDRESS)); + DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS)); // We could do a single 64-bit load here, but it's likely that the basic // 32-bit and extract sequence is already present, and it is probably easier @@ -326,6 +328,10 @@ static bool canVectorizeInst(Instruction *Inst, User *User) { // Currently only handle the case where the Pointer Operand is a GEP. // Also we could not vectorize volatile or atomic loads. LoadInst *LI = cast<LoadInst>(Inst); + if (isa<AllocaInst>(User) && + LI->getPointerOperandType() == User->getType() && + isa<VectorType>(LI->getType())) + return true; return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple(); } case Instruction::BitCast: @@ -335,6 +341,10 @@ static bool canVectorizeInst(Instruction *Inst, User *User) { // since it should be canonical form, the User should be a GEP. // Also we could not vectorize volatile or atomic stores. 
StoreInst *SI = cast<StoreInst>(Inst); + if (isa<AllocaInst>(User) && + SI->getPointerOperandType() == User->getType() && + isa<VectorType>(SI->getValueOperand()->getType())) + return true; return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple(); } default: @@ -342,14 +352,15 @@ static bool canVectorizeInst(Instruction *Inst, User *User) { } } -static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) { +static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { if (DisablePromoteAllocaToVector) { LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n"); return false; } - ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType()); + Type *AT = Alloca->getAllocatedType(); + SequentialType *AllocaTy = dyn_cast<SequentialType>(AT); LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n"); @@ -396,7 +407,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) { } } - VectorType *VectorTy = arrayTypeToVecType(AllocaTy); + VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy); + if (!VectorTy) + VectorTy = arrayTypeToVecType(cast<ArrayType>(AllocaTy)); LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " << *VectorTy << '\n'); @@ -406,7 +419,10 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) { IRBuilder<> Builder(Inst); switch (Inst->getOpcode()) { case Instruction::Load: { - Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS); + if (Inst->getType() == AT) + break; + + Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand(); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); @@ -418,9 +434,11 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) { break; } case Instruction::Store: { - Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS); - StoreInst *SI = cast<StoreInst>(Inst); + if (SI->getValueOperand()->getType() == AT) + break; 
+ + Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS); Value *Ptr = SI->getPointerOperand(); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); @@ -610,7 +628,7 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { // we cannot use local memory in the pass. for (Type *ParamTy : FTy->params()) { PointerType *PtrTy = dyn_cast<PointerType>(ParamTy); - if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) { + if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { LocalMemLimit = 0; LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to " "local memory disabled.\n"); @@ -627,7 +645,7 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { // Check how much local memory is being used by global objects CurrentLocalMemUsage = 0; for (GlobalVariable &GV : Mod->globals()) { - if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS) + if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) continue; for (const User *U : GV.users()) { @@ -706,9 +724,12 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); - if (tryPromoteAllocaToVector(&I, AS)) + if (tryPromoteAllocaToVector(&I)) return true; // Promoted to vector. 
+ if (DisablePromoteAllocaToLDS) + return false; + const Function &ContainingFunction = *I.getParent()->getParent(); CallingConv::ID CC = ContainingFunction.getCallingConv(); @@ -775,7 +796,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { Twine(F->getName()) + Twine('.') + I.getName(), nullptr, GlobalVariable::NotThreadLocal, - AS.LOCAL_ADDRESS); + AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); GV->setAlignment(I.getAlignment()); @@ -808,7 +829,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) { Value *Src0 = CI->getOperand(0); Type *EltTy = Src0->getType()->getPointerElementType(); - PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS); + PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); if (isa<ConstantPointerNull>(CI->getOperand(0))) CI->setOperand(0, ConstantPointerNull::get(NewTy)); @@ -825,7 +846,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { continue; Type *EltTy = V->getType()->getPointerElementType(); - PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS); + PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); // FIXME: It doesn't really make sense to try to do this for all // instructions. 
@@ -894,7 +915,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { Type *SrcTy = Src->getType()->getPointerElementType(); Function *ObjectSize = Intrinsic::getDeclaration(Mod, Intrinsic::objectsize, - { Intr->getType(), PointerType::get(SrcTy, AS.LOCAL_ADDRESS) } + { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) } ); CallInst *NewCall = Builder.CreateCall( diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 012e4fe200aae..7a760dcf7a908 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -35,7 +35,7 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) : AMDGPUGenRegisterBankInfo(), TRI(static_cast<const SIRegisterInfo*>(&TRI)) { - // HACK: Until this is fully tablegen'd + // HACK: Until this is fully tablegen'd. static bool AlreadyInit = false; if (AlreadyInit) return; @@ -74,13 +74,16 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, const RegisterBank &Src, unsigned Size) const { if (Dst.getID() == AMDGPU::SGPRRegBankID && - Src.getID() == AMDGPU::VGPRRegBankID) + Src.getID() == AMDGPU::VGPRRegBankID) { return std::numeric_limits<unsigned>::max(); + } // SGPRRegBank with size 1 is actually vcc or another 64-bit sgpr written by // the valu. 
if (Size == 1 && Dst.getID() == AMDGPU::SCCRegBankID && - Src.getID() == AMDGPU::SGPRRegBankID) + (Src.getID() == AMDGPU::SGPRRegBankID || + Src.getID() == AMDGPU::VGPRRegBankID || + Src.getID() == AMDGPU::VCCRegBankID)) return std::numeric_limits<unsigned>::max(); return RegisterBankInfo::copyCost(Dst, Src, Size); @@ -145,7 +148,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( AltMappings.push_back(&SSMapping); const InstructionMapping &SVMapping = getInstructionMapping(2, 1, - getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr, // Predicate operand. AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}), @@ -153,7 +156,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( AltMappings.push_back(&SVMapping); const InstructionMapping &VSMapping = getInstructionMapping(3, 1, - getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr, // Predicate operand. AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), @@ -161,7 +164,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( AltMappings.push_back(&VSMapping); const InstructionMapping &VVMapping = getInstructionMapping(4, 1, - getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr, // Predicate operand. 
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}), @@ -170,6 +173,67 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings( return AltMappings; } + case TargetOpcode::G_SELECT: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + const InstructionMapping &SSMapping = getInstructionMapping(1, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}), + 4); // Num Operands + AltMappings.push_back(&SSMapping); + + const InstructionMapping &VVMapping = getInstructionMapping(2, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}), + 4); // Num Operands + AltMappings.push_back(&VVMapping); + + return AltMappings; + } + case TargetOpcode::G_UADDE: + case TargetOpcode::G_USUBE: + case TargetOpcode::G_SADDE: + case TargetOpcode::G_SSUBE: { + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + const InstructionMapping &SSMapping = getInstructionMapping(1, 1, + getOperandsMapping( + {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}), + 5); // Num Operands + AltMappings.push_back(&SSMapping); + + const InstructionMapping &VVMapping = getInstructionMapping(2, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 
Size), + AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}), + 5); // Num Operands + AltMappings.push_back(&VVMapping); + return AltMappings; + } + case AMDGPU::G_BRCOND: { + assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); + + const InstructionMapping &SMapping = getInstructionMapping( + 1, 1, getOperandsMapping( + {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}), + 2); // Num Operands + AltMappings.push_back(&SMapping); + + const InstructionMapping &VMapping = getInstructionMapping( + 1, 1, getOperandsMapping( + {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }), + 2); // Num Operands + AltMappings.push_back(&VMapping); + return AltMappings; + } default: break; } @@ -193,10 +257,16 @@ bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) { + if (!MI.getOperand(i).isReg()) + continue; unsigned Reg = MI.getOperand(i).getReg(); - const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); - if (Bank && Bank->getID() != AMDGPU::SGPRRegBankID) - return false; + if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { + if (Bank->getID() == AMDGPU::VGPRRegBankID) + return false; + + assert(Bank->getID() == AMDGPU::SGPRRegBankID || + Bank->getID() == AMDGPU::SCCRegBankID); + } } return true; } @@ -209,7 +279,8 @@ AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); - OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + unsigned BankID = Size == 1 ? 
AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID; + OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); } return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands()); @@ -230,12 +301,32 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { unsigned Reg1 = MI.getOperand(OpdIdx).getReg(); unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI); - unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI); + + unsigned DefaultBankID = Size1 == 1 ? + AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; + unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID); + OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1); for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) { unsigned Size = getSizeInBits(MI.getOperand(OpdIdx).getReg(), MRI, *TRI); - OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); + unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; + OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size); + } + + return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), + MI.getNumOperands()); +} + +const RegisterBankInfo::InstructionMapping & +AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); + + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { + unsigned Size = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI); + OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); } return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), @@ -304,21 +395,49 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: return getInvalidInstructionMapping(); + + case AMDGPU::G_AND: + case AMDGPU::G_OR: + case AMDGPU::G_XOR: { + unsigned Size = 
MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + if (Size == 1) { + OpdsMapping[0] = OpdsMapping[1] = + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } + + LLVM_FALLTHROUGH; + } + case AMDGPU::G_ADD: case AMDGPU::G_SUB: case AMDGPU::G_MUL: - case AMDGPU::G_AND: - case AMDGPU::G_OR: - case AMDGPU::G_XOR: case AMDGPU::G_SHL: + case AMDGPU::G_UADDO: + case AMDGPU::G_SADDO: + case AMDGPU::G_USUBO: + case AMDGPU::G_SSUBO: + case AMDGPU::G_UADDE: + case AMDGPU::G_SADDE: + case AMDGPU::G_USUBE: + case AMDGPU::G_SSUBE: if (isSALUMapping(MI)) return getDefaultMappingSOP(MI); - // Fall-through + LLVM_FALLTHROUGH; case AMDGPU::G_FADD: + case AMDGPU::G_FSUB: case AMDGPU::G_FPTOSI: case AMDGPU::G_FPTOUI: case AMDGPU::G_FMUL: + case AMDGPU::G_FMA: + case AMDGPU::G_SITOFP: + case AMDGPU::G_UITOFP: + case AMDGPU::G_FPTRUNC: + case AMDGPU::G_FEXP2: + case AMDGPU::G_FLOG2: + case AMDGPU::G_INTRINSIC_TRUNC: + case AMDGPU::G_INTRINSIC_ROUND: return getDefaultMappingVOP(MI); case AMDGPU::G_IMPLICIT_DEF: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); @@ -326,11 +445,25 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_FCONSTANT: - case AMDGPU::G_CONSTANT: { + case AMDGPU::G_CONSTANT: + case AMDGPU::G_FRAME_INDEX: + case AMDGPU::G_BLOCK_ADDR: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } + case AMDGPU::G_INSERT: { + unsigned BankID = isSALUMapping(MI) ? 
AMDGPU::SGPRRegBankID : + AMDGPU::VGPRRegBankID; + unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); + OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); + OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); + OpdsMapping[3] = nullptr; + break; + } case AMDGPU::G_EXTRACT: { unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); @@ -352,7 +485,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); break; } - case AMDGPU::G_BITCAST: { + case AMDGPU::G_BITCAST: + case AMDGPU::G_INTTOPTR: + case AMDGPU::G_PTRTOINT: + case AMDGPU::G_CTLZ: + case AMDGPU::G_CTLZ_ZERO_UNDEF: + case AMDGPU::G_CTTZ: + case AMDGPU::G_CTTZ_ZERO_UNDEF: + case AMDGPU::G_CTPOP: + case AMDGPU::G_BSWAP: + case AMDGPU::G_FABS: + case AMDGPU::G_FNEG: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); @@ -368,7 +511,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); break; } - case AMDGPU::G_ZEXT: { + case AMDGPU::G_ZEXT: + case AMDGPU::G_SEXT: + case AMDGPU::G_ANYEXT: { unsigned Dst = MI.getOperand(0).getReg(); unsigned Src = MI.getOperand(1).getReg(); unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); @@ -391,7 +536,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_FCMP: { unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 
*TRI); - OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 1); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); OpdsMapping[1] = nullptr; // Predicate Operand. OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); @@ -431,7 +576,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); unsigned Op0Bank = Op2Bank == AMDGPU::SGPRRegBankID && Op3Bank == AMDGPU::SGPRRegBankID ? - AMDGPU::SCCRegBankID : AMDGPU::VGPRRegBankID; + AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1); OpdsMapping[1] = nullptr; // Predicate Operand. OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); @@ -479,6 +624,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } + case AMDGPU::G_UNMERGE_VALUES: { + unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : + AMDGPU::VGPRRegBankID; + + // Op1 and Dst should use the same register bank. + // FIXME: Shouldn't this be the default? Why do we need to handle this? 
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); + OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size); + } + break; + } case AMDGPU::G_INTRINSIC: { switch (MI.getOperand(1).getIntrinsicID()) { default: @@ -492,6 +649,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } + case Intrinsic::amdgcn_wqm_vote: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + OpdsMapping[0] = OpdsMapping[2] + = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); + break; + } } break; } @@ -528,8 +691,50 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } break; } + case AMDGPU::G_SELECT: { + unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned Op1Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); + bool SGPRSrcs = Op1Bank == AMDGPU::SCCRegBankID && + Op2Bank == AMDGPU::SGPRRegBankID && + Op3Bank == AMDGPU::SGPRRegBankID; + unsigned Bank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + Op1Bank = SGPRSrcs ? 
AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; + OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); + OpdsMapping[1] = AMDGPU::getValueMapping(Op1Bank, 1); + OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); + OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); + break; + } + case AMDGPU::G_LOAD: return getInstrMappingForLoad(MI); + + case AMDGPU::G_ATOMICRMW_XCHG: + case AMDGPU::G_ATOMICRMW_ADD: + case AMDGPU::G_ATOMICRMW_SUB: + case AMDGPU::G_ATOMICRMW_AND: + case AMDGPU::G_ATOMICRMW_OR: + case AMDGPU::G_ATOMICRMW_XOR: + case AMDGPU::G_ATOMICRMW_MAX: + case AMDGPU::G_ATOMICRMW_MIN: + case AMDGPU::G_ATOMICRMW_UMAX: + case AMDGPU::G_ATOMICRMW_UMIN: + case AMDGPU::G_ATOMIC_CMPXCHG: { + return getDefaultMappingAllVGPR(MI); + } + case AMDGPU::G_BRCOND: { + unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI, + AMDGPU::SGPRRegBankID); + assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); + if (Bank != AMDGPU::SCCRegBankID) + Bank = AMDGPU::VCCRegBankID; + + OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); + break; + } } return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index d48a665898735..d29f4bc79a519 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -49,6 +49,8 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { bool isSALUMapping(const MachineInstr &MI) const; const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const; const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const; + const InstructionMapping &getDefaultMappingAllVGPR( + const MachineInstr &MI) const; public: AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI); diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td index 7f7f75f656479..570379a820e12 100644 --- 
a/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -15,4 +15,7 @@ def VGPRRegBank : RegisterBank<"VGPR", [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512] >; -def SCCRegBank : RegisterBank <"SCC", [SCC_CLASS ]>; +def SCCRegBank : RegisterBank <"SCC", [SCC_CLASS]>; + +// It is helpful to distinguish conditions from ordinary SGPRs. +def VCCRegBank : RegisterBank <"VCC", [SReg_64]>; diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h index 07de5fc549e29..922d974f2ebd6 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -27,8 +27,6 @@ class TargetInstrInfo; struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { AMDGPURegisterInfo(); - bool enableMultipleCopyHints() const override { return true; } - /// \returns the sub reg enum value for the given \p Channel /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0) static unsigned getSubRegFromChannel(unsigned Channel); diff --git a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp index a861762a8c9e3..efe501cb73c27 100644 --- a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp +++ b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -163,7 +163,7 @@ bool AMDGPURewriteOutArguments::checkArgumentUses(Value &Arg) const { // some casts between structs and non-structs, but we can't bitcast // directly between them. directly bitcast between them. 
Blender uses // some casts that look like { <3 x float> }* to <4 x float>* - if ((SrcEltTy->isStructTy() && (SrcEltTy->getNumContainedTypes() != 1))) + if ((SrcEltTy->isStructTy() && (SrcEltTy->getStructNumElements() != 1))) return false; // Clang emits OpenCL 3-vector type accesses with a bitcast to the @@ -401,8 +401,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { if (Val->getType() != EltTy) { Type *EffectiveEltTy = EltTy; if (StructType *CT = dyn_cast<StructType>(EltTy)) { - assert(CT->getNumContainedTypes() == 1); - EffectiveEltTy = CT->getContainedType(0); + assert(CT->getNumElements() == 1); + EffectiveEltTy = CT->getElementType(0); } if (DL->getTypeSizeInBits(EffectiveEltTy) != diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 98b49070fa99f..ed0cc70c3d9aa 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -74,6 +74,9 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, // We want to be able to turn these off, but making this a subtarget feature // for SI has the unhelpful behavior that it unsets everything else if you // disable it. 
+ // + // Similarly we want enable-prt-strict-null to be on by default and not to + // unset everything else if it is disabled SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,"); @@ -89,6 +92,8 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, FullFS += "-fp32-denormals,"; } + FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS + FullFS += FS; ParseSubtargetFeatures(GPU, FullFS); @@ -124,10 +129,8 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, return *this; } -AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, - const FeatureBitset &FeatureBits) : +AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT), - SubtargetFeatureBits(FeatureBits), Has16BitInsts(false), HasMadMixInsts(false), FP32Denormals(false), @@ -136,19 +139,22 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, HasVOP3PInsts(false), HasMulI24(true), HasMulU24(true), + HasInv2PiInlineImm(false), HasFminFmaxLegacy(true), EnablePromoteAlloca(false), + HasTrigReducedRange(false), LocalMemorySize(0), WavefrontSize(0) { } GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, - const GCNTargetMachine &TM) : + const GCNTargetMachine &TM) : AMDGPUGenSubtargetInfo(TT, GPU, FS), - AMDGPUSubtarget(TT, getFeatureBits()), + AMDGPUSubtarget(TT), TargetTriple(TT), Gen(SOUTHERN_ISLANDS), IsaVersion(ISAVersion0_0_0), + InstrItins(getInstrItineraryForCPU(GPU)), LDSBankCount(0), MaxPrivateElementSize(0), @@ -170,16 +176,17 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, DebuggerEmitPrologue(false), EnableHugePrivateBuffer(false), - EnableVGPRSpilling(false), EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), EnableSIScheduler(false), EnableDS128(false), + EnablePRTStrictNull(false), DumpCode(false), FP64(false), GCN3Encoding(false), CIInsts(false), + VIInsts(false), GFX9Insts(false), SGPRInitBug(false), HasSMemRealTime(false), @@ -189,15 +196,16 @@ 
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasVGPRIndexMode(false), HasScalarStores(false), HasScalarAtomics(false), - HasInv2PiInlineImm(false), HasSDWAOmod(false), HasSDWAScalar(false), HasSDWASdst(false), HasSDWAMac(false), HasSDWAOutModsVOPC(false), HasDPP(false), + HasR128A16(false), HasDLInsts(false), - D16PreservesUnusedBits(false), + HasDotInsts(false), + EnableSRAMECC(false), FlatAddressSpace(false), FlatInstOffsets(false), FlatGlobalInsts(false), @@ -211,7 +219,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), TLInfo(TM, *this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { - AS = AMDGPU::getAMDGPUAS(TT); CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); Legalizer.reset(new AMDGPULegalizerInfo(*this, TM)); RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); @@ -447,7 +454,7 @@ unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, const TargetMachine &TM) : R600GenSubtargetInfo(TT, GPU, FS), - AMDGPUSubtarget(TT, getFeatureBits()), + AMDGPUSubtarget(TT), InstrInfo(*this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), FMA(false), @@ -460,8 +467,7 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, TexVTXClauseSize(0), Gen(R600), TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)), - InstrItins(getInstrItineraryForCPU(GPU)), - AS (AMDGPU::getAMDGPUAS(TT)) { } + InstrItins(getInstrItineraryForCPU(GPU)) { } void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const { @@ -480,10 +486,6 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, Policy.ShouldTrackLaneMasks = true; } -bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const { - return EnableVGPRSpilling || 
!AMDGPU::isShader(F.getCallingConv()); -} - unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { if (SGPRs <= 80) diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 6231097336518..5584759e55804 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -63,7 +63,6 @@ private: Triple TargetTriple; protected: - const FeatureBitset &SubtargetFeatureBits; bool Has16BitInsts; bool HasMadMixInsts; bool FP32Denormals; @@ -72,13 +71,15 @@ protected: bool HasVOP3PInsts; bool HasMulI24; bool HasMulU24; + bool HasInv2PiInlineImm; bool HasFminFmaxLegacy; bool EnablePromoteAlloca; + bool HasTrigReducedRange; int LocalMemorySize; unsigned WavefrontSize; public: - AMDGPUSubtarget(const Triple &TT, const FeatureBitset &FeatureBits); + AMDGPUSubtarget(const Triple &TT); static const AMDGPUSubtarget &get(const MachineFunction &MF); static const AMDGPUSubtarget &get(const TargetMachine &TM, @@ -134,7 +135,7 @@ public: return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv()); } - bool isAmdCodeObjectV2(const Function &F) const { + bool isAmdHsaOrMesa(const Function &F) const { return isAmdHsaOS() || isMesaKernel(F); } @@ -170,10 +171,18 @@ public: return HasMulU24; } + bool hasInv2PiInlineImm() const { + return HasInv2PiInlineImm; + } + bool hasFminFmaxLegacy() const { return HasFminFmaxLegacy; } + bool hasTrigReducedRange() const { + return HasTrigReducedRange; + } + bool isPromoteAllocaEnabled() const { return EnablePromoteAlloca; } @@ -193,38 +202,26 @@ public: /// Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. unsigned getExplicitKernelArgOffset(const Function &F) const { - return isAmdCodeObjectV2(F) ? 0 : 36; + return isAmdHsaOrMesa(F) ? 
0 : 36; } /// \returns Maximum number of work groups per compute unit supported by the /// subtarget and limited by given \p FlatWorkGroupSize. - unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(SubtargetFeatureBits, - FlatWorkGroupSize); - } + virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0; /// \returns Minimum flat work group size supported by the subtarget. - unsigned getMinFlatWorkGroupSize() const { - return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(SubtargetFeatureBits); - } + virtual unsigned getMinFlatWorkGroupSize() const = 0; /// \returns Maximum flat work group size supported by the subtarget. - unsigned getMaxFlatWorkGroupSize() const { - return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(SubtargetFeatureBits); - } + virtual unsigned getMaxFlatWorkGroupSize() const = 0; /// \returns Maximum number of waves per execution unit supported by the /// subtarget and limited by given \p FlatWorkGroupSize. - unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getMaxWavesPerEU(SubtargetFeatureBits, - FlatWorkGroupSize); - } + virtual unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const = 0; /// \returns Minimum number of waves per execution unit supported by the /// subtarget. - unsigned getMinWavesPerEU() const { - return AMDGPU::IsaInfo::getMinWavesPerEU(SubtargetFeatureBits); - } + virtual unsigned getMinWavesPerEU() const = 0; unsigned getMaxWavesPerEU() const { return 10; } @@ -266,6 +263,7 @@ public: ISAVersion9_0_2, ISAVersion9_0_4, ISAVersion9_0_6, + ISAVersion9_0_9, }; enum TrapHandlerAbi { @@ -300,6 +298,7 @@ protected: Triple TargetTriple; unsigned Gen; unsigned IsaVersion; + InstrItineraryData InstrItins; int LDSBankCount; unsigned MaxPrivateElementSize; @@ -323,11 +322,11 @@ protected: // Used as options. 
bool EnableHugePrivateBuffer; - bool EnableVGPRSpilling; bool EnableLoadStoreOpt; bool EnableUnsafeDSOffsetFolding; bool EnableSIScheduler; bool EnableDS128; + bool EnablePRTStrictNull; bool DumpCode; // Subtarget statically properties set by tablegen @@ -337,6 +336,7 @@ protected: bool IsGCN; bool GCN3Encoding; bool CIInsts; + bool VIInsts; bool GFX9Insts; bool SGPRInitBug; bool HasSMemRealTime; @@ -346,15 +346,16 @@ protected: bool HasVGPRIndexMode; bool HasScalarStores; bool HasScalarAtomics; - bool HasInv2PiInlineImm; bool HasSDWAOmod; bool HasSDWAScalar; bool HasSDWASdst; bool HasSDWAMac; bool HasSDWAOutModsVOPC; bool HasDPP; + bool HasR128A16; bool HasDLInsts; - bool D16PreservesUnusedBits; + bool HasDotInsts; + bool EnableSRAMECC; bool FlatAddressSpace; bool FlatInstOffsets; bool FlatGlobalInsts; @@ -372,7 +373,6 @@ protected: bool FeatureDisable; SelectionDAGTargetInfo TSInfo; - AMDGPUAS AS; private: SIInstrInfo InstrInfo; SITargetLowering TLInfo; @@ -423,6 +423,10 @@ public: return &TSInfo; } + const InstrItineraryData *getInstrItineraryData() const override { + return &InstrItins; + } + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); Generation getGeneration() const { @@ -441,10 +445,6 @@ public: return MaxPrivateElementSize; } - AMDGPUAS getAMDGPUAS() const { - return AS; - } - bool hasIntClamp() const { return HasIntClamp; } @@ -517,6 +517,10 @@ public: return FMA; } + bool hasSwap() const { + return GFX9Insts; + } + TrapHandlerAbi getTrapHandlerAbi() const { return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; } @@ -574,12 +578,19 @@ public: return getGeneration() < AMDGPUSubtarget::GFX9; } + /// \returns If target requires PRT Struct NULL support (zero result registers + /// for sparse texture support). 
+ bool usePRTStrictNull() const { + return EnablePRTStrictNull; + } + bool hasAutoWaitcntBeforeBarrier() const { return AutoWaitcntBeforeBarrier; } bool hasCodeObjectV3() const { - return CodeObjectV3; + // FIXME: Need to add code object v3 support for mesa and pal. + return isAmdHsaOS() ? CodeObjectV3 : false; } bool hasUnalignedBufferAccess() const { @@ -677,8 +688,12 @@ public: return HasDLInsts; } - bool d16PreservesUnusedBits() const { - return D16PreservesUnusedBits; + bool hasDotInsts() const { + return HasDotInsts; + } + + bool isSRAMECCEnabled() const { + return EnableSRAMECC; } // Scratch is allocated in 256 dword per wave blocks for the entire @@ -707,20 +722,19 @@ public: /// \returns Number of execution units per compute unit supported by the /// subtarget. unsigned getEUsPerCU() const { - return AMDGPU::IsaInfo::getEUsPerCU(MCSubtargetInfo::getFeatureBits()); + return AMDGPU::IsaInfo::getEUsPerCU(this); } /// \returns Maximum number of waves per compute unit supported by the /// subtarget without any kind of limitation. unsigned getMaxWavesPerCU() const { - return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits()); + return AMDGPU::IsaInfo::getMaxWavesPerCU(this); } /// \returns Maximum number of waves per compute unit supported by the /// subtarget and limited by given \p FlatWorkGroupSize. unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits(), - FlatWorkGroupSize); + return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize); } /// \returns Maximum number of waves per execution unit supported by the @@ -732,8 +746,7 @@ public: /// \returns Number of waves per work group supported by the subtarget and /// limited by given \p FlatWorkGroupSize. 
unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getWavesPerWorkGroup( - MCSubtargetInfo::getFeatureBits(), FlatWorkGroupSize); + return AMDGPU::IsaInfo::getWavesPerWorkGroup(this, FlatWorkGroupSize); } // static wrappers @@ -747,8 +760,6 @@ public: void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; - bool isVGPRSpillingEnabled(const Function &F) const; - unsigned getMaxNumUserSGPRs() const { return 16; } @@ -781,14 +792,15 @@ public: return HasScalarAtomics; } - bool hasInv2PiInlineImm() const { - return HasInv2PiInlineImm; - } bool hasDPP() const { return HasDPP; } + bool hasR128A16() const { + return HasR128A16; + } + bool enableSIScheduler() const { return EnableSIScheduler; } @@ -817,6 +829,11 @@ public: return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; } + // \returns true if the subtarget supports DWORDX3 load/store instructions. + bool hasDwordx3LoadStores() const { + return CIInsts; + } + bool hasSMovFedHazard() const { return getGeneration() >= AMDGPUSubtarget::GFX9; } @@ -851,39 +868,34 @@ public: /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { - return AMDGPU::IsaInfo::getSGPRAllocGranule( - MCSubtargetInfo::getFeatureBits()); + return AMDGPU::IsaInfo::getSGPRAllocGranule(this); } /// \returns SGPR encoding granularity supported by the subtarget. unsigned getSGPREncodingGranule() const { - return AMDGPU::IsaInfo::getSGPREncodingGranule( - MCSubtargetInfo::getFeatureBits()); + return AMDGPU::IsaInfo::getSGPREncodingGranule(this); } /// \returns Total number of SGPRs supported by the subtarget. unsigned getTotalNumSGPRs() const { - return AMDGPU::IsaInfo::getTotalNumSGPRs(MCSubtargetInfo::getFeatureBits()); + return AMDGPU::IsaInfo::getTotalNumSGPRs(this); } /// \returns Addressable number of SGPRs supported by the subtarget. 
unsigned getAddressableNumSGPRs() const { - return AMDGPU::IsaInfo::getAddressableNumSGPRs( - MCSubtargetInfo::getFeatureBits()); + return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); } /// \returns Minimum number of SGPRs that meets the given number of waves per /// execution unit requirement supported by the subtarget. unsigned getMinNumSGPRs(unsigned WavesPerEU) const { - return AMDGPU::IsaInfo::getMinNumSGPRs(MCSubtargetInfo::getFeatureBits(), - WavesPerEU); + return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); } /// \returns Maximum number of SGPRs that meets the given number of waves per /// execution unit requirement supported by the subtarget. unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { - return AMDGPU::IsaInfo::getMaxNumSGPRs(MCSubtargetInfo::getFeatureBits(), - WavesPerEU, Addressable); + return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); } /// \returns Reserved number of SGPRs for given function \p MF. @@ -901,39 +913,34 @@ public: /// \returns VGPR allocation granularity supported by the subtarget. unsigned getVGPRAllocGranule() const { - return AMDGPU::IsaInfo::getVGPRAllocGranule( - MCSubtargetInfo::getFeatureBits()); + return AMDGPU::IsaInfo::getVGPRAllocGranule(this); } /// \returns VGPR encoding granularity supported by the subtarget. unsigned getVGPREncodingGranule() const { - return AMDGPU::IsaInfo::getVGPREncodingGranule( - MCSubtargetInfo::getFeatureBits()); + return AMDGPU::IsaInfo::getVGPREncodingGranule(this); } /// \returns Total number of VGPRs supported by the subtarget. unsigned getTotalNumVGPRs() const { - return AMDGPU::IsaInfo::getTotalNumVGPRs(MCSubtargetInfo::getFeatureBits()); + return AMDGPU::IsaInfo::getTotalNumVGPRs(this); } /// \returns Addressable number of VGPRs supported by the subtarget. 
unsigned getAddressableNumVGPRs() const { - return AMDGPU::IsaInfo::getAddressableNumVGPRs( - MCSubtargetInfo::getFeatureBits()); + return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); } /// \returns Minimum number of VGPRs that meets given number of waves per /// execution unit requirement supported by the subtarget. unsigned getMinNumVGPRs(unsigned WavesPerEU) const { - return AMDGPU::IsaInfo::getMinNumVGPRs(MCSubtargetInfo::getFeatureBits(), - WavesPerEU); + return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); } /// \returns Maximum number of VGPRs that meets given number of waves per /// execution unit requirement supported by the subtarget. unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { - return AMDGPU::IsaInfo::getMaxNumVGPRs(MCSubtargetInfo::getFeatureBits(), - WavesPerEU); + return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); } /// \returns Maximum number of VGPRs that meets number of waves per execution @@ -949,6 +956,34 @@ public: void getPostRAMutations( std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const override; + + /// \returns Maximum number of work groups per compute unit supported by the + /// subtarget and limited by given \p FlatWorkGroupSize. + unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { + return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); + } + + /// \returns Minimum flat work group size supported by the subtarget. + unsigned getMinFlatWorkGroupSize() const override { + return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); + } + + /// \returns Maximum flat work group size supported by the subtarget. + unsigned getMaxFlatWorkGroupSize() const override { + return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); + } + + /// \returns Maximum number of waves per execution unit supported by the + /// subtarget and limited by given \p FlatWorkGroupSize. 
+ unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override { + return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize); + } + + /// \returns Minimum number of waves per execution unit supported by the + /// subtarget. + unsigned getMinWavesPerEU() const override { + return AMDGPU::IsaInfo::getMinWavesPerEU(this); + } }; class R600Subtarget final : public R600GenSubtargetInfo, @@ -968,7 +1003,6 @@ private: R600TargetLowering TLInfo; InstrItineraryData InstrItins; SelectionDAGTargetInfo TSInfo; - AMDGPUAS AS; public: R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, @@ -1053,8 +1087,6 @@ public: short getTexVTXClauseSize() const { return TexVTXClauseSize; } - AMDGPUAS getAMDGPUAS() const { return AS; } - bool enableMachineScheduler() const override { return true; } @@ -1062,6 +1094,34 @@ public: bool enableSubRegLiveness() const override { return true; } + + /// \returns Maximum number of work groups per compute unit supported by the + /// subtarget and limited by given \p FlatWorkGroupSize. + unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { + return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); + } + + /// \returns Minimum flat work group size supported by the subtarget. + unsigned getMinFlatWorkGroupSize() const override { + return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); + } + + /// \returns Maximum flat work group size supported by the subtarget. + unsigned getMaxFlatWorkGroupSize() const override { + return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); + } + + /// \returns Maximum number of waves per execution unit supported by the + /// subtarget and limited by given \p FlatWorkGroupSize. + unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override { + return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize); + } + + /// \returns Minimum number of waves per execution unit supported by the + /// subtarget. 
+ unsigned getMinWavesPerEU() const override { + return AMDGPU::IsaInfo::getMinWavesPerEU(this); + } }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 2205819c444ff..e8cefdbf74b97 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -45,6 +45,7 @@ #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Vectorize.h" #include <memory> @@ -105,6 +106,11 @@ static cl::opt<bool> EnableSDWAPeephole( cl::desc("Enable SDWA peepholer"), cl::init(true)); +static cl::opt<bool> EnableDPPCombine( + "amdgpu-dpp-combine", + cl::desc("Enable DPP combiner"), + cl::init(false)); + // Enable address space based alias analysis static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, cl::desc("Enable AMDGPU Alias Analysis"), @@ -137,6 +143,20 @@ static cl::opt<bool> EnableLowerKernelArguments( cl::init(true), cl::Hidden); +// Enable atomic optimization +static cl::opt<bool> EnableAtomicOptimizations( + "amdgpu-atomic-optimizations", + cl::desc("Enable atomic optimizations"), + cl::init(false), + cl::Hidden); + +// Enable Mode register optimization +static cl::opt<bool> EnableSIModeRegisterPass( + "amdgpu-mode-register", + cl::desc("Enable mode register pass"), + cl::init(true), + cl::Hidden); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget()); @@ -150,18 +170,22 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeR600VectorRegMergerPass(*PR); initializeGlobalISel(*PR); initializeAMDGPUDAGToDAGISelPass(*PR); + initializeGCNDPPCombinePass(*PR); initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); + initializeSIFixupVectorISelPass(*PR); 
initializeSIFoldOperandsPass(*PR); initializeSIPeepholeSDWAPass(*PR); initializeSIShrinkInstructionsPass(*PR); initializeSIOptimizeExecMaskingPreRAPass(*PR); initializeSILoadStoreOptimizerPass(*PR); + initializeAMDGPUFixFunctionBitcastsPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); initializeAMDGPUArgumentUsageInfoPass(*PR); + initializeAMDGPUAtomicOptimizerPass(*PR); initializeAMDGPULowerKernelArgumentsPass(*PR); initializeAMDGPULowerKernelAttributesPass(*PR); initializeAMDGPULowerIntrinsicsPass(*PR); @@ -172,6 +196,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); initializeSIInsertWaitcntsPass(*PR); + initializeSIModeRegisterPass(*PR); initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); initializeSIInsertSkipsPass(*PR); @@ -182,6 +207,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSIFormMemoryClausesPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); + initializeAMDGPUExternalAAWrapperPass(*PR); initializeAMDGPUUseNativeCallsPass(*PR); initializeAMDGPUSimplifyLibCallsPass(*PR); initializeAMDGPUInlinerPass(*PR); @@ -292,12 +318,6 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { return Reloc::PIC_; } -static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) { - if (CM) - return *CM; - return CodeModel::Small; -} - AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, @@ -306,9 +326,8 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OptLevel) : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), FS, Options, getEffectiveRelocModel(RM), - getEffectiveCodeModel(CM), OptLevel), + getEffectiveCodeModel(CM, 
CodeModel::Small), OptLevel), TLOF(createTLOF(getTargetTriple())) { - AS = AMDGPU::getAMDGPUAS(TT); initAsmInfo(); } @@ -331,13 +350,6 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { FSAttr.getValueAsString(); } -static ImmutablePass *createAMDGPUExternalAAWrapperPass() { - return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) { - if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>()) - AAR.addAAResult(WrapperPass->getResult()); - }); -} - /// Predicate for Internalize pass. static bool mustPreserveGV(const GlobalValue &GV) { if (const Function *F = dyn_cast<Function>(&GV)) @@ -360,17 +372,6 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { Builder.Inliner = createAMDGPUFunctionInliningPass(); } - if (Internalize) { - // If we're generating code, we always have the whole program available. The - // relocations expected for externally visible functions aren't supported, - // so make sure every non-entry function is hidden. - Builder.addExtension( - PassManagerBuilder::EP_EnabledOnOptLevel0, - [](const PassManagerBuilder &, legacy::PassManagerBase &PM) { - PM.add(createInternalizePass(mustPreserveGV)); - }); - } - Builder.addExtension( PassManagerBuilder::EP_ModuleOptimizerEarly, [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &, @@ -613,20 +614,23 @@ void AMDGPUPassConfig::addIRPasses() { disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); + addPass(createAtomicExpandPass()); + + // This must occur before inlining, as the inliner will not look through + // bitcast calls. + addPass(createAMDGPUFixFunctionBitcastsPass()); + addPass(createAMDGPULowerIntrinsicsPass()); - if (TM.getTargetTriple().getArch() == Triple::r600 || - !EnableAMDGPUFunctionCalls) { - // Function calls are not supported, so make sure we inline everything. 
- addPass(createAMDGPUAlwaysInlinePass()); - addPass(createAlwaysInlinerLegacyPass()); - // We need to add the barrier noop pass, otherwise adding the function - // inlining pass will cause all of the PassConfigs passes to be run - // one function at a time, which means if we have a nodule with two - // functions, then we will generate code for the first function - // without ever running any passes on the second. - addPass(createBarrierNoopPass()); - } + // Function calls are not supported, so make sure we inline everything. + addPass(createAMDGPUAlwaysInlinePass()); + addPass(createAlwaysInlinerLegacyPass()); + // We need to add the barrier noop pass, otherwise adding the function + // inlining pass will cause all of the PassConfigs passes to be run + // one function at a time, which means if we have a nodule with two + // functions, then we will generate code for the first function + // without ever running any passes on the second. + addPass(createBarrierNoopPass()); if (TM.getTargetTriple().getArch() == Triple::amdgcn) { // TODO: May want to move later or split into an early and late one. @@ -690,6 +694,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() { } bool AMDGPUPassConfig::addPreISel() { + addPass(createLowerSwitchPass()); addPass(createFlattenCFGPass()); return false; } @@ -759,6 +764,10 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); + if (EnableAtomicOptimizations) { + addPass(createAMDGPUAtomicOptimizerPass()); + } + // FIXME: We need to run a pass to propagate the attributes when calls are // supported. addPass(createAMDGPUAnnotateKernelFeaturesPass()); @@ -789,6 +798,8 @@ void GCNPassConfig::addMachineSSAOptimization() { // // XXX - Can we get away without running DeadMachineInstructionElim again? 
addPass(&SIFoldOperandsID); + if (EnableDPPCombine) + addPass(&GCNDPPCombineID); addPass(&DeadMachineInstructionElimID); addPass(&SILoadStoreOptimizerID); if (EnableSDWAPeephole) { @@ -811,8 +822,10 @@ bool GCNPassConfig::addILPOpts() { bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); - addPass(createSILowerI1CopiesPass()); addPass(&SIFixSGPRCopiesID); + addPass(createSILowerI1CopiesPass()); + addPass(createSIFixupVectorISelPass()); + addPass(createSIAddIMGInitPass()); return false; } @@ -878,7 +891,8 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { void GCNPassConfig::addPostRegAlloc() { addPass(&SIFixVGPRCopiesID); - addPass(&SIOptimizeExecMaskingID); + if (getOptLevel() > CodeGenOpt::None) + addPass(&SIOptimizeExecMaskingID); TargetPassConfig::addPostRegAlloc(); } @@ -889,6 +903,7 @@ void GCNPassConfig::addPreEmitPass() { addPass(createSIMemoryLegalizerPass()); addPass(createSIInsertWaitcntsPass()); addPass(createSIShrinkInstructionsPass()); + addPass(createSIModeRegisterPass()); // The hazard recognizer that runs as part of the post-ra scheduler does not // guarantee to be able handle all hazards correctly. This is because if there diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 0fe14493fabdd..62fbe71d19023 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -34,7 +34,6 @@ namespace llvm { class AMDGPUTargetMachine : public LLVMTargetMachine { protected: std::unique_ptr<TargetLoweringObjectFile> TLOF; - AMDGPUAS AS; StringRef getGPUName(const Function &F) const; StringRef getFeatureString(const Function &F) const; @@ -55,16 +54,13 @@ public: TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } - AMDGPUAS getAMDGPUAS() const { - return AS; - } void adjustPassManager(PassManagerBuilder &) override; + /// Get the integer value of a null pointer in the given address space. 
uint64_t getNullPointerValue(unsigned AddrSpace) const { - if (AddrSpace == AS.LOCAL_ADDRESS || AddrSpace == AS.REGION_ADDRESS) - return -1; - return 0; + return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || + AddrSpace == AMDGPUAS::REGION_ADDRESS) ? -1 : 0; } }; diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp index e2f718bd3c34d..c4e1efde130b8 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -29,3 +29,13 @@ MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal( return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM); } + +MCSection *AMDGPUTargetObjectFile::getExplicitSectionGlobal( + const GlobalObject *GO, SectionKind SK, const TargetMachine &TM) const { + // Set metadata access for the explicit section + StringRef SectionName = GO->getSection(); + if (SectionName.startswith(".AMDGPU.comment.")) + SK = SectionKind::getMetadata(); + + return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, SK, TM); +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h index dd9dc1a88fc2b..a4ae1a2c18c26 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -26,6 +26,8 @@ class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF { public: MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override; + MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind, + const TargetMachine &TM) const override; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index a68b8d03f06e2..11e4ba4b5010d 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -102,7 +102,6 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, 
ScalarEvolution &SE, unsigned ThresholdPrivate = UnrollThresholdPrivate; unsigned ThresholdLocal = UnrollThresholdLocal; unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal); - const AMDGPUAS &ASST = AMDGPU::getAMDGPUAS(TargetTriple); for (const BasicBlock *BB : L->getBlocks()) { const DataLayout &DL = BB->getModule()->getDataLayout(); unsigned LocalGEPsSeen = 0; @@ -140,9 +139,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, unsigned AS = GEP->getAddressSpace(); unsigned Threshold = 0; - if (AS == ASST.PRIVATE_ADDRESS) + if (AS == AMDGPUAS::PRIVATE_ADDRESS) Threshold = ThresholdPrivate; - else if (AS == ASST.LOCAL_ADDRESS) + else if (AS == AMDGPUAS::LOCAL_ADDRESS) Threshold = ThresholdLocal; else continue; @@ -150,7 +149,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, if (UP.Threshold >= Threshold) continue; - if (AS == ASST.PRIVATE_ADDRESS) { + if (AS == AMDGPUAS::PRIVATE_ADDRESS) { const Value *Ptr = GEP->getPointerOperand(); const AllocaInst *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL)); @@ -160,7 +159,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0; if (AllocaSize > MaxAlloca) continue; - } else if (AS == ASST.LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { LocalGEPsSeen++; // Inhibit unroll for local memory if we have seen addressing not to // a variable, most likely we will be unable to combine it. 
@@ -253,19 +252,18 @@ unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize, } unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { - AMDGPUAS AS = ST->getAMDGPUAS(); - if (AddrSpace == AS.GLOBAL_ADDRESS || - AddrSpace == AS.CONSTANT_ADDRESS || - AddrSpace == AS.CONSTANT_ADDRESS_32BIT) { + if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS || + AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || + AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { return 512; } - if (AddrSpace == AS.FLAT_ADDRESS || - AddrSpace == AS.LOCAL_ADDRESS || - AddrSpace == AS.REGION_ADDRESS) + if (AddrSpace == AMDGPUAS::FLAT_ADDRESS || + AddrSpace == AMDGPUAS::LOCAL_ADDRESS || + AddrSpace == AMDGPUAS::REGION_ADDRESS) return 128; - if (AddrSpace == AS.PRIVATE_ADDRESS) + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) return 8 * ST->getMaxPrivateElementSize(); llvm_unreachable("unhandled address space"); @@ -277,7 +275,7 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, // We allow vectorization of flat stores, even though we may need to decompose // them later if they may access private memory. We don't have enough context // here, and legalization can handle it. - if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) { + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) && ChainSizeInBytes <= ST->getMaxPrivateElementSize(); } @@ -545,14 +543,15 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const { if (const Argument *A = dyn_cast<Argument>(V)) return !isArgPassedInSGPR(A); - // Loads from the private address space are divergent, because threads - // can execute the load instruction with the same inputs and get different - // results. + // Loads from the private and flat address spaces are divergent, because + // threads can execute the load instruction with the same inputs and get + // different results. 
// // All other loads are not divergent, because if threads issue loads with the // same arguments, they will always get the same result. if (const LoadInst *Load = dyn_cast<LoadInst>(V)) - return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS; + return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS || + Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS; // Atomics are divergent because they are executed sequentially: when an // atomic operation refers to the same address in each thread, then each @@ -642,20 +641,19 @@ unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const { } unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { - AMDGPUAS AS = ST->getAMDGPUAS(); - if (AddrSpace == AS.GLOBAL_ADDRESS || - AddrSpace == AS.CONSTANT_ADDRESS) + if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS || + AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) return 128; - if (AddrSpace == AS.LOCAL_ADDRESS || - AddrSpace == AS.REGION_ADDRESS) + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || + AddrSpace == AMDGPUAS::REGION_ADDRESS) return 64; - if (AddrSpace == AS.PRIVATE_ADDRESS) + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) return 32; - if ((AddrSpace == AS.PARAM_D_ADDRESS || - AddrSpace == AS.PARAM_I_ADDRESS || - (AddrSpace >= AS.CONSTANT_BUFFER_0 && - AddrSpace <= AS.CONSTANT_BUFFER_15))) + if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS || + AddrSpace == AMDGPUAS::PARAM_I_ADDRESS || + (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 && + AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15))) return 128; llvm_unreachable("unhandled address space"); } @@ -666,9 +664,7 @@ bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, // We allow vectorization of flat stores, even though we may need to decompose // them later if they may access private memory. We don't have enough context // here, and legalization can handle it. 
- if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) - return false; - return true; + return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS); } bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 8e63d789e17d7..397c5c6fa6fbe 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -179,7 +179,7 @@ public: if (IsGraphicsShader) return -1; return ST->hasFlatAddressSpace() ? - ST->getAMDGPUAS().FLAT_ADDRESS : ST->getAMDGPUAS().UNKNOWN_ADDRESS_SPACE; + AMDGPUAS::FLAT_ADDRESS : AMDGPUAS::UNKNOWN_ADDRESS_SPACE; } unsigned getVectorSplitCost() { return 0; } diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 0d3a1673696a5..ced3f6f567e2f 100644 --- a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -25,7 +25,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Transforms/Utils/Local.h" @@ -70,7 +70,7 @@ char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID; INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, "Unify divergent function exit nodes", false, false) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, "Unify divergent function exit nodes", false, false) @@ -78,10 +78,10 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{ // TODO: Preserve dominator tree. 
AU.addRequired<PostDominatorTreeWrapperPass>(); - AU.addRequired<DivergenceAnalysis>(); + AU.addRequired<LegacyDivergenceAnalysis>(); // No divergent values are changed, only blocks and branch edges. - AU.addPreserved<DivergenceAnalysis>(); + AU.addPreserved<LegacyDivergenceAnalysis>(); // We preserve the non-critical-edgeness property AU.addPreservedID(BreakCriticalEdgesID); @@ -95,7 +95,7 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{ /// \returns true if \p BB is reachable through only uniform branches. /// XXX - Is there a more efficient way to find this? -static bool isUniformlyReached(const DivergenceAnalysis &DA, +static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA, BasicBlock &BB) { SmallVector<BasicBlock *, 8> Stack; SmallPtrSet<BasicBlock *, 8> Visited; @@ -163,7 +163,7 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { if (PDT.getRoots().size() <= 1) return false; - DivergenceAnalysis &DA = getAnalysis<DivergenceAnalysis>(); + LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>(); // Loop over all of the blocks in a function, tracking all of the blocks that // return. 
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 31e2885c833d9..3f9af27a2e5e1 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -49,6 +49,7 @@ #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SMLoc.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -156,13 +157,12 @@ public: ImmTyDMask, ImmTyUNorm, ImmTyDA, - ImmTyR128, + ImmTyR128A16, ImmTyLWE, ImmTyExpTgt, ImmTyExpCompr, ImmTyExpVM, - ImmTyDFMT, - ImmTyNFMT, + ImmTyFORMAT, ImmTyHwreg, ImmTyOff, ImmTySendMsg, @@ -291,7 +291,7 @@ public: bool isDMask() const { return isImmTy(ImmTyDMask); } bool isUNorm() const { return isImmTy(ImmTyUNorm); } bool isDA() const { return isImmTy(ImmTyDA); } - bool isR128() const { return isImmTy(ImmTyR128); } + bool isR128A16() const { return isImmTy(ImmTyR128A16); } bool isLWE() const { return isImmTy(ImmTyLWE); } bool isOff() const { return isImmTy(ImmTyOff); } bool isExpTgt() const { return isImmTy(ImmTyExpTgt); } @@ -312,8 +312,7 @@ public: bool isSLC() const { return isImmTy(ImmTySLC); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isD16() const { return isImmTy(ImmTyD16); } - bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); } - bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); } + bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<8>(getImm()); } bool isBankMask() const { return isImmTy(ImmTyDppBankMask); } bool isRowMask() const { return isImmTy(ImmTyDppRowMask); } bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); } @@ -666,8 +665,7 @@ public: case ImmTySLC: OS << "SLC"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; - case ImmTyDFMT: OS << "DFMT"; break; - case ImmTyNFMT: OS << "NFMT"; break; 
+ case ImmTyFORMAT: OS << "FORMAT"; break; case ImmTyClampSI: OS << "ClampSI"; break; case ImmTyOModSI: OS << "OModSI"; break; case ImmTyDppCtrl: OS << "DppCtrl"; break; @@ -681,7 +679,7 @@ public: case ImmTyDMask: OS << "DMask"; break; case ImmTyUNorm: OS << "UNorm"; break; case ImmTyDA: OS << "DA"; break; - case ImmTyR128: OS << "R128"; break; + case ImmTyR128A16: OS << "R128A16"; break; case ImmTyLWE: OS << "LWE"; break; case ImmTyOff: OS << "Off"; break; case ImmTyExpTgt: OS << "ExpTgt"; break; @@ -920,8 +918,7 @@ public: // Currently there is none suitable machinery in the core llvm-mc for this. // MCSymbol::isRedefinable is intended for another purpose, and // AsmParser::parseDirectiveSet() cannot be specialized for specific target. - AMDGPU::IsaInfo::IsaVersion ISA = - AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); + AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); MCContext &Ctx = getContext(); if (ISA.Major >= 6 && AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) { MCSymbol *Sym = @@ -1061,6 +1058,7 @@ public: OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands); OperandMatchResultTy parseRegWithIntInputMods(OperandVector &Operands); OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands); + OperandMatchResultTy parseDfmtNfmt(OperandVector &Operands); void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); void cvtDS(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, false); } @@ -1092,7 +1090,6 @@ private: bool validateMIMGAtomicDMask(const MCInst &Inst); bool validateMIMGGatherDMask(const MCInst &Inst); bool validateMIMGDataSize(const MCInst &Inst); - bool validateMIMGR128(const MCInst &Inst); bool validateMIMGD16(const MCInst &Inst); bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; @@ -1829,7 +1826,7 @@ bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) 
{ // Symbols are only defined for GCN targets - if (AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()).Major < 6) + if (AMDGPU::getIsaVersion(getSTI().getCPU()).Major < 6) return true; auto SymbolName = getGprCountSymbolName(RegKind); @@ -2447,22 +2444,6 @@ bool AMDGPUAsmParser::validateMIMGGatherDMask(const MCInst &Inst) { return DMask == 0x1 || DMask == 0x2 || DMask == 0x4 || DMask == 0x8; } -bool AMDGPUAsmParser::validateMIMGR128(const MCInst &Inst) { - - const unsigned Opc = Inst.getOpcode(); - const MCInstrDesc &Desc = MII.get(Opc); - - if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0) - return true; - - int Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128); - assert(Idx != -1); - - bool R128 = (Inst.getOperand(Idx).getImm() != 0); - - return !R128 || hasMIMG_R128(); -} - bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) { const unsigned Opc = Inst.getOpcode(); @@ -2497,11 +2478,6 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, "integer clamping is not supported on this GPU"); return false; } - if (!validateMIMGR128(Inst)) { - Error(IDLoc, - "r128 modifier is not supported on this GPU"); - return false; - } // For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate. if (!validateMIMGD16(Inst)) { Error(IDLoc, @@ -2661,18 +2637,18 @@ bool AMDGPUAsmParser::calculateGPRBlocks( unsigned &SGPRBlocks) { // TODO(scott.linder): These calculations are duplicated from // AMDGPUAsmPrinter::getSIProgramInfo and could be unified. 
- IsaInfo::IsaVersion Version = IsaInfo::getIsaVersion(Features); + IsaVersion Version = getIsaVersion(getSTI().getCPU()); unsigned NumVGPRs = NextFreeVGPR; unsigned NumSGPRs = NextFreeSGPR; - unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(Features); + unsigned MaxAddressableNumSGPRs = IsaInfo::getAddressableNumSGPRs(&getSTI()); if (Version.Major >= 8 && !Features.test(FeatureSGPRInitBug) && NumSGPRs > MaxAddressableNumSGPRs) return OutOfRangeError(SGPRRange); NumSGPRs += - IsaInfo::getNumExtraSGPRs(Features, VCCUsed, FlatScrUsed, XNACKUsed); + IsaInfo::getNumExtraSGPRs(&getSTI(), VCCUsed, FlatScrUsed, XNACKUsed); if ((Version.Major <= 7 || Features.test(FeatureSGPRInitBug)) && NumSGPRs > MaxAddressableNumSGPRs) @@ -2681,8 +2657,8 @@ bool AMDGPUAsmParser::calculateGPRBlocks( if (Features.test(FeatureSGPRInitBug)) NumSGPRs = IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; - VGPRBlocks = IsaInfo::getNumVGPRBlocks(Features, NumVGPRs); - SGPRBlocks = IsaInfo::getNumSGPRBlocks(Features, NumSGPRs); + VGPRBlocks = IsaInfo::getNumVGPRBlocks(&getSTI(), NumVGPRs); + SGPRBlocks = IsaInfo::getNumSGPRBlocks(&getSTI(), NumSGPRs); return false; } @@ -2702,8 +2678,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { StringSet<> Seen; - IsaInfo::IsaVersion IVersion = - IsaInfo::getIsaVersion(getSTI().getFeatureBits()); + IsaVersion IVersion = getIsaVersion(getSTI().getCPU()); SMRange VGPRRange; uint64_t NextFreeVGPR = 0; @@ -2962,8 +2937,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { // If this directive has no arguments, then use the ISA version for the // targeted GPU. 
if (getLexer().is(AsmToken::EndOfStatement)) { - AMDGPU::IsaInfo::IsaVersion ISA = - AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); + AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); getTargetStreamer().EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU"); @@ -3025,7 +2999,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { amd_kernel_code_t Header; - AMDGPU::initDefaultAMDKernelCodeT(Header, getFeatureBits()); + AMDGPU::initDefaultAMDKernelCodeT(Header, &getSTI()); while (true) { // Lex EndOfStatement. This is in a while loop, because lexing a comment @@ -3091,9 +3065,18 @@ bool AMDGPUAsmParser::ParseDirectiveISAVersion() { } bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { + const char *AssemblerDirectiveBegin; + const char *AssemblerDirectiveEnd; + std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) = + AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI()) + ? std::make_tuple(HSAMD::V3::AssemblerDirectiveBegin, + HSAMD::V3::AssemblerDirectiveEnd) + : std::make_tuple(HSAMD::AssemblerDirectiveBegin, + HSAMD::AssemblerDirectiveEnd); + if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA) { return Error(getParser().getTok().getLoc(), - (Twine(HSAMD::AssemblerDirectiveBegin) + Twine(" directive is " + (Twine(AssemblerDirectiveBegin) + Twine(" directive is " "not available on non-amdhsa OSes")).str()); } @@ -3111,7 +3094,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { if (getLexer().is(AsmToken::Identifier)) { StringRef ID = getLexer().getTok().getIdentifier(); - if (ID == AMDGPU::HSAMD::AssemblerDirectiveEnd) { + if (ID == AssemblerDirectiveEnd) { Lex(); FoundEnd = true; break; @@ -3133,8 +3116,13 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { YamlStream.flush(); - if (!getTargetStreamer().EmitHSAMetadata(HSAMetadataString)) - return Error(getParser().getTok().getLoc(), "invalid HSA metadata"); + if 
(IsaInfo::hasCodeObjectV3(&getSTI())) { + if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString)) + return Error(getParser().getTok().getLoc(), "invalid HSA metadata"); + } else { + if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString)) + return Error(getParser().getTok().getLoc(), "invalid HSA metadata"); + } return false; } @@ -3171,6 +3159,10 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amdhsa_kernel") return ParseDirectiveAMDHSAKernel(); + + // TODO: Restructure/combine with PAL metadata directive. + if (IDVal == AMDGPU::HSAMD::V3::AssemblerDirectiveBegin) + return ParseDirectiveHSAMetadata(); } else { if (IDVal == ".hsa_code_object_version") return ParseDirectiveHSACodeObjectVersion(); @@ -3186,10 +3178,10 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amd_amdgpu_isa") return ParseDirectiveISAVersion(); - } - if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin) - return ParseDirectiveHSAMetadata(); + if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin) + return ParseDirectiveHSAMetadata(); + } if (IDVal == PALMD::AssemblerDirective) return ParseDirectivePALMetadata(); @@ -3465,6 +3457,10 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, case AsmToken::Identifier: { StringRef Tok = Parser.getTok().getString(); if (Tok == Name) { + if (Tok == "r128" && isGFX9()) + Error(S, "r128 modifier is not supported on this GPU"); + if (Tok == "a16" && !isGFX9()) + Error(S, "a16 modifier is not supported on this GPU"); Bit = 1; Parser.Lex(); } else if (Tok.startswith("no") && Tok.endswith(Name)) { @@ -3522,6 +3518,53 @@ AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, StringRef &Value) { return MatchOperand_Success; } +// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their +// values to live in a joint format operand in the MCInst encoding. 
+OperandMatchResultTy +AMDGPUAsmParser::parseDfmtNfmt(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + int64_t Dfmt = 0, Nfmt = 0; + // dfmt and nfmt can appear in either order, and each is optional. + bool GotDfmt = false, GotNfmt = false; + while (!GotDfmt || !GotNfmt) { + if (!GotDfmt) { + auto Res = parseIntWithPrefix("dfmt", Dfmt); + if (Res != MatchOperand_NoMatch) { + if (Res != MatchOperand_Success) + return Res; + if (Dfmt >= 16) { + Error(Parser.getTok().getLoc(), "out of range dfmt"); + return MatchOperand_ParseFail; + } + GotDfmt = true; + Parser.Lex(); + continue; + } + } + if (!GotNfmt) { + auto Res = parseIntWithPrefix("nfmt", Nfmt); + if (Res != MatchOperand_NoMatch) { + if (Res != MatchOperand_Success) + return Res; + if (Nfmt >= 8) { + Error(Parser.getTok().getLoc(), "out of range nfmt"); + return MatchOperand_ParseFail; + } + GotNfmt = true; + Parser.Lex(); + continue; + } + } + break; + } + if (!GotDfmt && !GotNfmt) + return MatchOperand_NoMatch; + auto Format = Dfmt | Nfmt << 4; + Operands.push_back( + AMDGPUOperand::CreateImm(this, Format, S, AMDGPUOperand::ImmTyFORMAT)); + return MatchOperand_Success; +} + //===----------------------------------------------------------------------===// // ds //===----------------------------------------------------------------------===// @@ -3652,12 +3695,12 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) { static bool encodeCnt( - const AMDGPU::IsaInfo::IsaVersion ISA, + const AMDGPU::IsaVersion ISA, int64_t &IntVal, int64_t CntVal, bool Saturate, - unsigned (*encode)(const IsaInfo::IsaVersion &Version, unsigned, unsigned), - unsigned (*decode)(const IsaInfo::IsaVersion &Version, unsigned)) + unsigned (*encode)(const IsaVersion &Version, unsigned, unsigned), + unsigned (*decode)(const IsaVersion &Version, unsigned)) { bool Failed = false; @@ -3688,8 +3731,7 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { if (getParser().parseAbsoluteExpression(CntVal)) 
return true; - AMDGPU::IsaInfo::IsaVersion ISA = - AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); + AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); bool Failed = true; bool Sat = CntName.endswith("_sat"); @@ -3724,8 +3766,7 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { OperandMatchResultTy AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { - AMDGPU::IsaInfo::IsaVersion ISA = - AMDGPU::IsaInfo::getIsaVersion(getFeatureBits()); + AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); int64_t Waitcnt = getWaitcntBitMask(ISA); SMLoc S = Parser.getTok().getLoc(); @@ -4617,8 +4658,7 @@ void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyFORMAT); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); @@ -4661,7 +4701,7 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands, addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); @@ 
-4761,8 +4801,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"lds", AMDGPUOperand::ImmTyLDS, true, nullptr}, {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, {"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr}, - {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr}, - {"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr}, + {"dfmt", AMDGPUOperand::ImmTyFORMAT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, @@ -4772,7 +4811,8 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"omod", AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul}, {"unorm", AMDGPUOperand::ImmTyUNorm, true, nullptr}, {"da", AMDGPUOperand::ImmTyDA, true, nullptr}, - {"r128", AMDGPUOperand::ImmTyR128, true, nullptr}, + {"r128", AMDGPUOperand::ImmTyR128A16, true, nullptr}, + {"a16", AMDGPUOperand::ImmTyR128A16, true, nullptr}, {"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr}, {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr}, @@ -4844,6 +4884,8 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOpr(OperandVector &Operands) Op.Type == AMDGPUOperand::ImmTyNegHi) { res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); + } else if (Op.Type == AMDGPUOperand::ImmTyFORMAT) { + res = parseDfmtNfmt(Operands); } else { res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); } @@ -5251,12 +5293,14 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } - // All DPP instructions with at least one source operand have a fake "old" - // source at the beginning that's tied to the dst operand. Handle it here. 
- if (Desc.getNumOperands() >= 2) - Inst.addOperand(Inst.getOperand(0)); - for (unsigned E = Operands.size(); I != E; ++I) { + auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(), + MCOI::TIED_TO); + if (TiedTo != -1) { + assert((unsigned)TiedTo < Inst.getNumOperands()); + // handle tied old or src2 for MAC instructions + Inst.addOperand(Inst.getOperand(TiedTo)); + } AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index b87c47a6b9eea..51c2abeac2ffb 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -17,14 +17,12 @@ def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">; def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; -def MUBUFIntrinsicOffset : ComplexPattern<i32, 2, "SelectMUBUFIntrinsicOffset">; -def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset">; class MubufLoad <SDPatternOperator op> : PatFrag < (ops node:$ptr), (op node:$ptr), [{ auto const AS = cast<MemSDNode>(N)->getAddressSpace(); - return AS == AMDGPUASI.GLOBAL_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS; + return AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; }]>; def mubuf_load : MubufLoad <load>; @@ -100,15 +98,11 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins, bits<1> has_vaddr = 1; bits<1> has_glc = 1; bits<1> glc_value = 0; // the value for glc if no such operand - bits<4> dfmt_value = 1; // the value for dfmt if no such operand - bits<3> nfmt_value = 0; // the value for nfmt if no such operand bits<1> has_srsrc = 1; bits<1> has_soffset = 1; bits<1> has_offset = 1; bits<1> has_slc = 1; bits<1> has_tfe = 1; - bits<1> has_dfmt = 1; - bits<1> 
has_nfmt = 1; } class MTBUF_Real <MTBUF_Pseudo ps> : @@ -126,14 +120,16 @@ class MTBUF_Real <MTBUF_Pseudo ps> : bits<12> offset; bits<1> glc; - bits<4> dfmt; - bits<3> nfmt; + bits<7> format; bits<8> vaddr; bits<8> vdata; bits<7> srsrc; bits<1> slc; bits<1> tfe; bits<8> soffset; + + bits<4> dfmt = format{3-0}; + bits<3> nfmt = format{6-4}; } class getMTBUFInsDA<list<RegisterClass> vdataList, @@ -142,16 +138,16 @@ class getMTBUFInsDA<list<RegisterClass> vdataList, RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe), + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, SLC:$slc, TFE:$tfe) + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe) ); dag InsData = !if(!empty(vaddrList), (ins vdataClass:$vdata, SReg_128:$srsrc, - SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe), (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, - SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, + SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe) ); dag ret = !if(!empty(vdataList), InsNoData, InsData); @@ -169,15 +165,15 @@ class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[]> { class getMTBUFAsmOps<int addrKind> { string Pfx = - !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset", + !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $format, $soffset", !if(!eq(addrKind, BUFAddrKind.OffEn), - "$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen", + "$vaddr, $srsrc, $format, $soffset offen", !if(!eq(addrKind, BUFAddrKind.IdxEn), - "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen", + "$vaddr, 
$srsrc, $format, $soffset idxen", !if(!eq(addrKind, BUFAddrKind.BothEn), - "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen", + "$vaddr, $srsrc, $format, $soffset idxen offen", !if(!eq(addrKind, BUFAddrKind.Addr64), - "$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64", + "$vaddr, $srsrc, $format, $soffset addr64", ""))))); string ret = Pfx # "$offset"; } @@ -217,14 +213,14 @@ multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, [(set load_vt:$vdata, - (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt, - i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>, + (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format, + i1:$glc, i1:$slc, i1:$tfe)))]>, MTBUFAddr64Table<0, NAME>; def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, - i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>, + i8:$format, i1:$glc, i1:$slc, i1:$tfe)))]>, MTBUFAddr64Table<1, NAME>; def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; @@ -263,13 +259,13 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, + i16:$offset, i8:$format, i1:$glc, i1:$slc, i1:$tfe))]>, MTBUFAddr64Table<0, NAME>; def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc, + i16:$offset, i8:$format, i1:$glc, i1:$slc, i1:$tfe))]>, MTBUFAddr64Table<1, NAME>; @@ -290,6 +286,12 @@ multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, // MUBUF classes //===----------------------------------------------------------------------===// +class 
MUBUFGetBaseOpcode<string Op> { + string ret = !subst("DWORDX2", "DWORD", + !subst("DWORDX3", "DWORD", + !subst("DWORDX4", "DWORD", Op))); +} + class MUBUF_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]> : InstSI<outs, ins, "", pattern>, @@ -303,6 +305,9 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins, string Mnemonic = opName; string AsmOperands = asmOps; + Instruction Opcode = !cast<Instruction>(NAME); + Instruction BaseOpcode = !cast<Instruction>(MUBUFGetBaseOpcode<NAME>.ret); + let VM_CNT = 1; let EXP_CNT = 1; let MUBUF = 1; @@ -325,6 +330,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins, bits<1> has_offset = 1; bits<1> has_slc = 1; bits<1> has_tfe = 1; + bits<4> dwords = 0; } class MUBUF_Real <bits<7> op, MUBUF_Pseudo ps> : @@ -398,6 +404,16 @@ class getMUBUFInsDA<list<RegisterClass> vdataList, ); } +class getMUBUFDwords<RegisterClass regClass> { + string regClassAsInt = !cast<string>(regClass); + int ret = + !if(!eq(regClassAsInt, !cast<string>(VGPR_32)), 1, + !if(!eq(regClassAsInt, !cast<string>(VReg_64)), 2, + !if(!eq(regClassAsInt, !cast<string>(VReg_96)), 3, + !if(!eq(regClassAsInt, !cast<string>(VReg_128)), 4, + 0)))); +} + class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit isLds = 0> { dag ret = !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isLds>.ret, @@ -458,6 +474,7 @@ class MUBUF_Load_Pseudo <string opName, let Uses = !if(isLds, [EXEC, M0], [EXEC]); let has_tfe = !if(isLds, 0, 1); let lds = isLds; + let dwords = getMUBUFDwords<vdataClass>.ret; } // FIXME: tfe can't be an operand because it requires a separate @@ -521,6 +538,7 @@ class MUBUF_Store_Pseudo <string opName, let mayLoad = 0; let mayStore = 1; let maybeAtomic = 1; + let dwords = getMUBUFDwords<vdataClass>.ret; } multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, @@ -660,11 +678,10 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind, let AsmMatchConverter = 
"cvtMubufAtomicReturn"; } -multiclass MUBUF_Pseudo_Atomics <string opName, - RegisterClass vdataClass, - ValueType vdataType, - SDPatternOperator atomic> { - +multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName, + RegisterClass vdataClass, + ValueType vdataType, + SDPatternOperator atomic> { def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>, MUBUFAddr64Table <0, NAME>; def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>, @@ -672,7 +689,12 @@ multiclass MUBUF_Pseudo_Atomics <string opName, def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; +} +multiclass MUBUF_Pseudo_Atomics_RTN <string opName, + RegisterClass vdataClass, + ValueType vdataType, + SDPatternOperator atomic> { def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, [(set vdataType:$vdata, (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc), @@ -690,6 +712,13 @@ multiclass MUBUF_Pseudo_Atomics <string opName, def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; } +multiclass MUBUF_Pseudo_Atomics <string opName, + RegisterClass vdataClass, + ValueType vdataType, + SDPatternOperator atomic> : + MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType, atomic>, + MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>; + //===----------------------------------------------------------------------===// // MUBUF Instructions @@ -1030,6 +1059,14 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol", // MUBUF Patterns //===----------------------------------------------------------------------===// +def extract_glc : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8); +}]>; + +def extract_slc : SDNodeXForm<imm, [{ + return 
CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8); +}]>; + //===----------------------------------------------------------------------===// // buffer_load/store_format patterns //===----------------------------------------------------------------------===// @@ -1037,119 +1074,129 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol", multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { def : GCNPat< - (vt (name v4i32:$rsrc, 0, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$glc, imm:$slc)), + (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, + imm:$cachepolicy, 0)), (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< - (vt (name v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$glc, imm:$slc)), - (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) + (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, + imm:$cachepolicy, 0)), + (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< - (vt (name v4i32:$rsrc, 0, - (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), - imm:$glc, imm:$slc)), - (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) + (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, + imm:$cachepolicy, imm)), + (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< - (vt (name v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), - imm:$glc, imm:$slc)), + (vt 
(name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, + imm:$cachepolicy, imm)), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; } defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, i32, "BUFFER_LOAD_FORMAT_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2i32, "BUFFER_LOAD_FORMAT_XY">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4i32, "BUFFER_LOAD_FORMAT_XYZW">; let SubtargetPredicate = HasUnpackedD16VMem in { defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">; } // End HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i16, "BUFFER_LOAD_FORMAT_D16_XY">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">; + defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i16, "BUFFER_LOAD_FORMAT_D16_XYZW">; } // End HasPackedD16VMem. 
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">; defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">; +defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">; multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, 0, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$glc, imm:$slc), + (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, + imm:$cachepolicy, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$glc, imm:$slc), - (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), 0) + (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, + imm:$cachepolicy, 0), + (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, 0, - (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), - imm:$glc, imm:$slc), - (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), 0) + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, + imm:$cachepolicy, imm), + (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (extract_glc 
$cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< - (name vt:$vdata, v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), - imm:$glc, imm:$slc), + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, + imm:$cachepolicy, imm), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), - (as_i1imm $glc), (as_i1imm $slc), 0) + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; } defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2i32, "BUFFER_STORE_FORMAT_XY">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4i32, "BUFFER_STORE_FORMAT_XYZW">; let SubtargetPredicate = HasUnpackedD16VMem in { defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">; + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X_gfx80">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">; } // End HasUnpackedD16VMem. 
let SubtargetPredicate = HasPackedD16VMem in { defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">; + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">; + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i16, "BUFFER_STORE_FORMAT_D16_XY">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">; + defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i16, "BUFFER_STORE_FORMAT_D16_XYZW">; } // End HasPackedD16VMem. defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">; defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">; //===----------------------------------------------------------------------===// // buffer_atomic patterns @@ -1158,36 +1205,36 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">; multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> { def : GCNPat< (name i32:$vdata_in, v4i32:$rsrc, 0, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$slc), + 0, i32:$soffset, imm:$offset, + imm:$cachepolicy, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset, - (as_i16imm $offset), (as_i1imm $slc)) + (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$slc), + 0, i32:$soffset, imm:$offset, + imm:$cachepolicy, imm), (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, 
$soffset, - (as_i16imm $offset), (as_i1imm $slc)) + (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (name i32:$vdata_in, v4i32:$rsrc, 0, - (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), - imm:$slc), + i32:$voffset, i32:$soffset, imm:$offset, + imm:$cachepolicy, 0), (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (as_i1imm $slc)) + (as_i16imm $offset), (extract_slc $cachepolicy)) >; def : GCNPat< (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), - imm:$slc), + i32:$voffset, i32:$soffset, imm:$offset, + imm:$cachepolicy, imm), (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN) $vdata_in, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)) + $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)) >; } @@ -1205,49 +1252,49 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_xor, "BUFFER_ATOMIC_XOR">; def : GCNPat< (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, 0, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$slc), + 0, i32:$soffset, imm:$offset, + imm:$cachepolicy, 0), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), - $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)), sub0) >; def : GCNPat< (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), - imm:$slc), + 0, i32:$soffset, imm:$offset, + imm:$cachepolicy, imm), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), - $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + $vindex, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)), sub0) >; def : GCNPat< (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, 0, - 
(MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), - imm:$slc), + i32:$voffset, i32:$soffset, imm:$offset, + imm:$cachepolicy, 0), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), - $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + $voffset, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)), sub0) >; def : GCNPat< (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, - (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), - imm:$slc), + i32:$voffset, i32:$soffset, imm:$offset, + imm:$cachepolicy, imm), (EXTRACT_SUBREG (BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)), sub0) >; @@ -1397,54 +1444,6 @@ defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, i16, az_extloadi8_private>; defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, i16, sextloadi8_private>; } - -// BUFFER_LOAD_DWORD*, addr64=0 -multiclass MUBUF_Load_Dword <ValueType vt, - MUBUF_Pseudo offset, - MUBUF_Pseudo offen, - MUBUF_Pseudo idxen, - MUBUF_Pseudo bothen> { - - def : GCNPat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, - imm:$offset, 0, 0, imm:$glc, imm:$slc, - imm:$tfe)), - (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), (as_i1imm $tfe)) - >; - - def : GCNPat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm:$offset, 1, 0, imm:$glc, imm:$slc, - imm:$tfe)), - (offen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $tfe)) - >; - - def : GCNPat < - (vt (int_SI_buffer_load_dword 
v4i32:$rsrc, i32:$vaddr, i32:$soffset, - imm:$offset, 0, 1, imm:$glc, imm:$slc, - imm:$tfe)), - (idxen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), - (as_i1imm $slc), (as_i1imm $tfe)) - >; - - def : GCNPat < - (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, - imm:$offset, 1, 1, imm:$glc, imm:$slc, - imm:$tfe)), - (bothen $vaddr, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), - (as_i1imm $tfe)) - >; -} - -defm : MUBUF_Load_Dword <i32, BUFFER_LOAD_DWORD_OFFSET, BUFFER_LOAD_DWORD_OFFEN, - BUFFER_LOAD_DWORD_IDXEN, BUFFER_LOAD_DWORD_BOTHEN>; -defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_OFFEN, - BUFFER_LOAD_DWORDX2_IDXEN, BUFFER_LOAD_DWORDX2_BOTHEN>; -defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN, - BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>; - multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag atomic_st> { // Store follows atomic op convention so address is forst @@ -1524,32 +1523,36 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { def : GCNPat< (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, - imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + imm:$format, imm:$cachepolicy, 0)), (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), - (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, - imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + imm:$format, imm:$cachepolicy, imm)), (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), - (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 
0) >; def : GCNPat< (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, - imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + imm:$format, imm:$cachepolicy, 0)), (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), - (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, - imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + imm:$format, imm:$cachepolicy, imm)), (!cast<MTBUF_Pseudo>(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), - (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; } @@ -1576,39 +1579,36 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, - imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + imm:$format, imm:$cachepolicy, 0), (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, - (as_i16imm $offset), (as_i8imm $dfmt), - (as_i8imm $nfmt), (as_i1imm $glc), - (as_i1imm $slc), 0) + (as_i16imm $offset), (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, - imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + imm:$format, imm:$cachepolicy, imm), (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (as_i8imm $dfmt), - (as_i8imm $nfmt), (as_i1imm $glc), - (as_i1imm $slc), 0) + (as_i16imm $offset), (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, - imm:$dfmt, 
imm:$nfmt, imm:$glc, imm:$slc), + imm:$format, imm:$cachepolicy, 0), (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (as_i8imm $dfmt), - (as_i8imm $nfmt), (as_i1imm $glc), - (as_i1imm $slc), 0) + (as_i16imm $offset), (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, - imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + imm:$offset, imm:$format, imm:$cachepolicy, imm), (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), - (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), + (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0) >; } @@ -1781,8 +1781,8 @@ class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> : let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); let Inst{15} = ps.addr64; let Inst{18-16} = op; - let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); - let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); let Inst{47-40} = !if(ps.has_vdata, vdata, ?); @@ -1811,6 +1811,7 @@ defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>; //===----------------------------------------------------------------------===// // CI +// MTBUF - GFX6, GFX7. 
//===----------------------------------------------------------------------===// class MUBUF_Real_ci <bits<7> op, MUBUF_Pseudo ps> : @@ -2013,8 +2014,8 @@ class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> : let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); let Inst{18-15} = op; - let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); - let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); let Inst{47-40} = !if(ps.has_vdata, vdata, ?); @@ -2043,8 +2044,8 @@ class MTBUF_Real_gfx80 <bits<4> op, MTBUF_Pseudo ps> : let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); let Inst{18-15} = op; - let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); - let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{22-19} = dfmt; + let Inst{25-23} = nfmt; let Inst{31-26} = 0x3a; //encoding let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); let Inst{47-40} = !if(ps.has_vdata, vdata, ?); @@ -2089,3 +2090,22 @@ let SubtargetPredicate = HasPackedD16VMem in { defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_vi <0x0e>; defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_vi <0x0f>; } // End HasUnpackedD16VMem. 
+ +def MUBUFInfoTable : GenericTable { + let FilterClass = "MUBUF_Pseudo"; + let CppTypeName = "MUBUFInfo"; + let Fields = ["Opcode", "BaseOpcode", "dwords", "has_vaddr", "has_srsrc", "has_soffset"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getMUBUFOpcodeHelper"; +} + +def getMUBUFInfoFromOpcode : SearchIndex { + let Table = MUBUFInfoTable; + let Key = ["Opcode"]; +} + +def getMUBUFInfoFromBaseOpcodeAndDwords : SearchIndex { + let Table = MUBUFInfoTable; + let Key = ["BaseOpcode", "dwords"]; +} diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 174b2df15300a..393311791ec93 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -37,8 +37,10 @@ add_llvm_target(AMDGPUCodeGen AMDGPUAnnotateUniformValues.cpp AMDGPUArgumentUsageInfo.cpp AMDGPUAsmPrinter.cpp + AMDGPUAtomicOptimizer.cpp AMDGPUCallLowering.cpp AMDGPUCodeGenPrepare.cpp + AMDGPUFixFunctionBitcasts.cpp AMDGPUFrameLowering.cpp AMDGPUHSAMetadataStreamer.cpp AMDGPUInstrInfo.cpp @@ -91,9 +93,11 @@ add_llvm_target(AMDGPUCodeGen R600OptimizeVectorRegisters.cpp R600Packetizer.cpp R600RegisterInfo.cpp + SIAddIMGInit.cpp SIAnnotateControlFlow.cpp SIDebuggerInsertNops.cpp SIFixSGPRCopies.cpp + SIFixupVectorISel.cpp SIFixVGPRCopies.cpp SIFixWWMLiveness.cpp SIFoldOperands.cpp @@ -116,6 +120,8 @@ add_llvm_target(AMDGPUCodeGen SIShrinkInstructions.cpp SIWholeQuadMode.cpp GCNILPSched.cpp + GCNDPPCombine.cpp + SIModeRegister.cpp ) add_subdirectory(AsmParser) diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td index cdc6ab9412e61..31d2ebef481d2 100644 --- a/lib/Target/AMDGPU/DSInstructions.td +++ b/lib/Target/AMDGPU/DSInstructions.td @@ -728,7 +728,9 @@ class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat< (i1 0)) >; -let OtherPredicates = [LDSRequiresM0Init] in { +// v2i32 loads are split into i32 loads on SI during lowering, due to a bug +// related to bounds checking. 
+let OtherPredicates = [LDSRequiresM0Init, isCIVI] in { def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>; def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>; } diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index 3ef473b7fd966..44040d352e6a8 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -121,6 +121,11 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : let Inst{63-56} = !if(ps.has_vdst, vdst, ?); } +class GlobalSaddrTable <bit is_saddr, string Name = ""> { + bit IsSaddr = is_saddr; + string SaddrOp = Name; +} + // TODO: Is exec allowed for saddr? The disabled value 0x7f is the // same encoding value as exec_hi, so it isn't possible to use that if // saddr is 32-bit (which isn't handled here yet). @@ -171,15 +176,19 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { let is_flat_global = 1 in { - def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>; - def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>; + def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>, + GlobalSaddrTable<1, opName>; } } multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> { let is_flat_global = 1 in { - def "" : FLAT_Store_Pseudo<opName, regClass, 1, 1>; - def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1, 1>; + def "" : FLAT_Store_Pseudo<opName, regClass, 1, 1>, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1, 1>, + GlobalSaddrTable<1, opName>; } } @@ -262,6 +271,7 @@ multiclass FLAT_Atomic_Pseudo< (outs), (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, SLC:$slc), " $vaddr, $vdata$offset$slc">, + GlobalSaddrTable<0, opName>, AtomicNoRet <opName, 
0> { let PseudoInstr = NAME; } @@ -272,10 +282,11 @@ multiclass FLAT_Atomic_Pseudo< " $vdst, $vaddr, $vdata$offset glc$slc", [(set vt:$vdst, (atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, + GlobalSaddrTable<0, opName#"_rtn">, AtomicNoRet <opName, 1>; } -multiclass FLAT_Global_Atomic_Pseudo< +multiclass FLAT_Global_Atomic_Pseudo_NO_RTN< string opName, RegisterClass vdst_rc, ValueType vt, @@ -287,35 +298,48 @@ multiclass FLAT_Global_Atomic_Pseudo< (outs), (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc), " $vaddr, $vdata, off$offset$slc">, + GlobalSaddrTable<0, opName>, AtomicNoRet <opName, 0> { let has_saddr = 1; let PseudoInstr = NAME; } - def _RTN : FLAT_AtomicRet_Pseudo <opName, - (outs vdst_rc:$vdst), - (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc), - " $vdst, $vaddr, $vdata, off$offset glc$slc", - [(set vt:$vdst, - (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, - AtomicNoRet <opName, 1> { - let has_saddr = 1; - } - def _SADDR : FLAT_AtomicNoRet_Pseudo <opName, (outs), (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc), " $vaddr, $vdata, $saddr$offset$slc">, + GlobalSaddrTable<1, opName>, AtomicNoRet <opName#"_saddr", 0> { let has_saddr = 1; let enabled_saddr = 1; let PseudoInstr = NAME#"_SADDR"; } +} + +multiclass FLAT_Global_Atomic_Pseudo_RTN< + string opName, + RegisterClass vdst_rc, + ValueType vt, + SDPatternOperator atomic = null_frag, + ValueType data_vt = vt, + RegisterClass data_rc = vdst_rc> { + + def _RTN : FLAT_AtomicRet_Pseudo <opName, + (outs vdst_rc:$vdst), + (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc), + " $vdst, $vaddr, $vdata, off$offset glc$slc", + [(set vt:$vdst, + (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, + GlobalSaddrTable<0, opName#"_rtn">, + AtomicNoRet <opName, 1> { + let has_saddr = 1; + } def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName, (outs 
vdst_rc:$vdst), (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc), " $vdst, $vaddr, $vdata, $saddr$offset glc$slc">, + GlobalSaddrTable<1, opName#"_rtn">, AtomicNoRet <opName#"_saddr", 1> { let has_saddr = 1; let enabled_saddr = 1; @@ -323,10 +347,20 @@ multiclass FLAT_Global_Atomic_Pseudo< } } +multiclass FLAT_Global_Atomic_Pseudo< + string opName, + RegisterClass vdst_rc, + ValueType vt, + SDPatternOperator atomic = null_frag, + ValueType data_vt = vt, + RegisterClass data_rc = vdst_rc> : + FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, atomic, data_vt, data_rc>, + FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic, data_vt, data_rc>; + class flat_binary_atomic_op<SDNode atomic_op> : PatFrag< (ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.FLAT_ADDRESS;}] + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}] >; def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>; diff --git a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp new file mode 100644 index 0000000000000..56071d0d23744 --- /dev/null +++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -0,0 +1,446 @@ +//=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0 +// operand.If any of the use instruction cannot be combined with the mov the +// whole sequence is reverted. +// +// $old = ... +// $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane, +// dpp_controls..., $bound_ctrl +// $res = VALU $dpp_value, ... 
+// +// to +// +// $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ..., +// dpp_controls..., $folded_bound_ctrl +// +// Combining rules : +// +// $bound_ctrl is DPP_BOUND_ZERO, $old is any +// $bound_ctrl is DPP_BOUND_OFF, $old is 0 +// +// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO +// $bound_ctrl is DPP_BOUND_OFF, $old is undef +// +// ->$folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF +// $bound_ctrl is DPP_BOUND_OFF, $old is foldable +// +// ->$folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Pass.h" +#include <cassert> + +using namespace llvm; + +#define DEBUG_TYPE "gcn-dpp-combine" + +STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined."); + +namespace { + +class GCNDPPCombine : public MachineFunctionPass { + MachineRegisterInfo *MRI; + const SIInstrInfo *TII; + + using RegSubRegPair = TargetInstrInfo::RegSubRegPair; + + MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const; + + RegSubRegPair foldOldOpnd(MachineInstr &OrigMI, + RegSubRegPair OldOpndVGPR, + MachineOperand &OldOpndValue) const; + + MachineInstr *createDPPInst(MachineInstr &OrigMI, + MachineInstr &MovMI, + RegSubRegPair OldOpndVGPR, + MachineOperand *OldOpnd, + bool BoundCtrlZero) const; + + MachineInstr *createDPPInst(MachineInstr &OrigMI, + MachineInstr &MovMI, + RegSubRegPair OldOpndVGPR, + 
bool BoundCtrlZero) const; + + bool hasNoImmOrEqual(MachineInstr &MI, + unsigned OpndName, + int64_t Value, + int64_t Mask = -1) const; + + bool combineDPPMov(MachineInstr &MI) const; + +public: + static char ID; + + GCNDPPCombine() : MachineFunctionPass(ID) { + initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "GCN DPP Combine"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false) + +char GCNDPPCombine::ID = 0; + +char &llvm::GCNDPPCombineID = GCNDPPCombine::ID; + +FunctionPass *llvm::createGCNDPPCombinePass() { + return new GCNDPPCombine(); +} + +static int getDPPOp(unsigned Op) { + auto DPP32 = AMDGPU::getDPPOp32(Op); + if (DPP32 != -1) + return DPP32; + + auto E32 = AMDGPU::getVOPe32(Op); + return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1; +} + +// tracks the register operand definition and returns: +// 1. immediate operand used to initialize the register if found +// 2. nullptr if the register operand is undef +// 3. 
the operand itself otherwise +MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const { + auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI); + if (!Def) + return nullptr; + + switch(Def->getOpcode()) { + default: break; + case AMDGPU::IMPLICIT_DEF: + return nullptr; + case AMDGPU::COPY: + case AMDGPU::V_MOV_B32_e32: { + auto &Op1 = Def->getOperand(1); + if (Op1.isImm()) + return &Op1; + break; + } + } + return &OldOpnd; +} + +MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, + MachineInstr &MovMI, + RegSubRegPair OldOpndVGPR, + bool BoundCtrlZero) const { + assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); + assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() == + TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg()); + + auto OrigOp = OrigMI.getOpcode(); + auto DPPOp = getDPPOp(OrigOp); + if (DPPOp == -1) { + LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n"); + return nullptr; + } + + auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI, + OrigMI.getDebugLoc(), TII->get(DPPOp)); + bool Fail = false; + do { + auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst); + assert(Dst); + DPPInst.add(*Dst); + int NumOperands = 1; + + const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old); + if (OldIdx != -1) { + assert(OldIdx == NumOperands); + assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI)); + DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg); + ++NumOperands; + } + + if (auto *Mod0 = TII->getNamedOperand(OrigMI, + AMDGPU::OpName::src0_modifiers)) { + assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, + AMDGPU::OpName::src0_modifiers)); + assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); + DPPInst.addImm(Mod0->getImm()); + ++NumOperands; + } + auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0); + assert(Src0); + if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) { + LLVM_DEBUG(dbgs() << " failed: src0 
is illegal\n"); + Fail = true; + break; + } + DPPInst.add(*Src0); + ++NumOperands; + + if (auto *Mod1 = TII->getNamedOperand(OrigMI, + AMDGPU::OpName::src1_modifiers)) { + assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, + AMDGPU::OpName::src1_modifiers)); + assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG))); + DPPInst.addImm(Mod1->getImm()); + ++NumOperands; + } + if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { + if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) { + LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n"); + Fail = true; + break; + } + DPPInst.add(*Src1); + ++NumOperands; + } + + if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) { + if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) { + LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n"); + Fail = true; + break; + } + DPPInst.add(*Src2); + } + + DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl)); + DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask)); + DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask)); + DPPInst.addImm(BoundCtrlZero ? 
1 : 0); + } while (false); + + if (Fail) { + DPPInst.getInstr()->eraseFromParent(); + return nullptr; + } + LLVM_DEBUG(dbgs() << " combined: " << *DPPInst.getInstr()); + return DPPInst.getInstr(); +} + +GCNDPPCombine::RegSubRegPair +GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI, + RegSubRegPair OldOpndVGPR, + MachineOperand &OldOpndValue) const { + assert(OldOpndValue.isImm()); + switch (OrigMI.getOpcode()) { + default: break; + case AMDGPU::V_MAX_U32_e32: + if (OldOpndValue.getImm() == std::numeric_limits<uint32_t>::max()) + return OldOpndVGPR; + break; + case AMDGPU::V_MAX_I32_e32: + if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::max()) + return OldOpndVGPR; + break; + case AMDGPU::V_MIN_I32_e32: + if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::min()) + return OldOpndVGPR; + break; + + case AMDGPU::V_MUL_I32_I24_e32: + case AMDGPU::V_MUL_U32_U24_e32: + if (OldOpndValue.getImm() == 1) { + auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1); + assert(Src1 && Src1->isReg()); + return getRegSubRegPair(*Src1); + } + break; + } + return RegSubRegPair(); +} + +// Cases to combine: +// $bound_ctrl is DPP_BOUND_ZERO, $old is any +// $bound_ctrl is DPP_BOUND_OFF, $old is 0 +// -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO + +// $bound_ctrl is DPP_BOUND_OFF, $old is undef +// -> $old = undef, $bound_ctrl = DPP_BOUND_OFF + +// $bound_ctrl is DPP_BOUND_OFF, $old is foldable +// -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF + +MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, + MachineInstr &MovMI, + RegSubRegPair OldOpndVGPR, + MachineOperand *OldOpndValue, + bool BoundCtrlZero) const { + assert(OldOpndVGPR.Reg); + if (!BoundCtrlZero && OldOpndValue) { + assert(OldOpndValue->isImm()); + OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue); + if (!OldOpndVGPR.Reg) { + LLVM_DEBUG(dbgs() << " failed: old immediate cannot be folded\n"); + return nullptr; + } + } + return createDPPInst(OrigMI, MovMI, OldOpndVGPR, 
BoundCtrlZero); +} + +// returns true if MI doesn't have OpndName immediate operand or the +// operand has Value +bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName, + int64_t Value, int64_t Mask) const { + auto *Imm = TII->getNamedOperand(MI, OpndName); + if (!Imm) + return true; + + assert(Imm->isImm()); + return (Imm->getImm() & Mask) == Value; +} + +bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const { + assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp); + auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl); + assert(BCZOpnd && BCZOpnd->isImm()); + bool BoundCtrlZero = 0 != BCZOpnd->getImm(); + + LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI); + + auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old); + assert(OldOpnd && OldOpnd->isReg()); + auto OldOpndVGPR = getRegSubRegPair(*OldOpnd); + auto *OldOpndValue = getOldOpndValue(*OldOpnd); + assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd); + if (OldOpndValue) { + if (BoundCtrlZero) { + OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd + OldOpndValue = nullptr; + } else { + if (!OldOpndValue->isImm()) { + LLVM_DEBUG(dbgs() << " failed: old operand isn't an imm or undef\n"); + return false; + } + if (OldOpndValue->getImm() == 0) { + OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef + OldOpndValue = nullptr; + BoundCtrlZero = true; + } + } + } + + LLVM_DEBUG(dbgs() << " old="; + if (!OldOpndValue) + dbgs() << "undef"; + else + dbgs() << OldOpndValue->getImm(); + dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n'); + + std::vector<MachineInstr*> OrigMIs, DPPMIs; + if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef + OldOpndVGPR = RegSubRegPair( + MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass)); + auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(), + TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg); + DPPMIs.push_back(UndefInst.getInstr()); + } + + OrigMIs.push_back(&MovMI); + 
bool Rollback = true; + for (auto &Use : MRI->use_nodbg_operands( + TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) { + Rollback = true; + + auto &OrigMI = *Use.getParent(); + auto OrigOp = OrigMI.getOpcode(); + if (TII->isVOP3(OrigOp)) { + if (!TII->hasVALU32BitEncoding(OrigOp)) { + LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n"); + break; + } + // check if other than abs|neg modifiers are set (opsel for example) + const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG); + if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) || + !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) || + !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) || + !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) { + LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n"); + break; + } + } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) { + LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n"); + break; + } + + LLVM_DEBUG(dbgs() << " combining: " << OrigMI); + if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) { + if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR, + OldOpndValue, BoundCtrlZero)) { + DPPMIs.push_back(DPPInst); + Rollback = false; + } + } else if (OrigMI.isCommutable() && + &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) { + auto *BB = OrigMI.getParent(); + auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI); + BB->insert(OrigMI, NewMI); + if (TII->commuteInstruction(*NewMI)) { + LLVM_DEBUG(dbgs() << " commuted: " << *NewMI); + if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR, + OldOpndValue, BoundCtrlZero)) { + DPPMIs.push_back(DPPInst); + Rollback = false; + } + } else + LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n"); + NewMI->eraseFromParent(); + } else + LLVM_DEBUG(dbgs() << " failed: no suitable operands\n"); + if (Rollback) + break; + OrigMIs.push_back(&OrigMI); + } + + for (auto *MI : *(Rollback? 
&DPPMIs : &OrigMIs)) + MI->eraseFromParent(); + + return !Rollback; +} + +bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) { + auto &ST = MF.getSubtarget<GCNSubtarget>(); + if (!ST.hasDPP() || skipFunction(MF.getFunction())) + return false; + + MRI = &MF.getRegInfo(); + TII = ST.getInstrInfo(); + + assert(MRI->isSSA() && "Must be run on SSA"); + + bool Changed = false; + for (auto &MBB : MF) { + for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) { + auto &MI = *I++; + if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) { + Changed = true; + ++NumDPPMovsCombined; + } + } + } + return Changed; +} diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index f236f10ba75ab..c6396de89c4f6 100644 --- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -215,6 +215,14 @@ void GCNHazardRecognizer::AdvanceCycle() { if (!CurrCycleInstr) return; + // Do not track non-instructions which do not affect the wait states. + // If included, these instructions can lead to buffer overflow such that + // detectable hazards are missed. 
+ if (CurrCycleInstr->getOpcode() == AMDGPU::IMPLICIT_DEF) + return; + else if (CurrCycleInstr->isDebugInstr()) + return; + unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr); // Keep track of emitted instructions @@ -253,8 +261,7 @@ int GCNHazardRecognizer::getWaitStatesSince( return WaitStates; unsigned Opcode = MI->getOpcode(); - if (Opcode == AMDGPU::DBG_VALUE || Opcode == AMDGPU::IMPLICIT_DEF || - Opcode == AMDGPU::INLINEASM) + if (Opcode == AMDGPU::INLINEASM) continue; } ++WaitStates; diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp index 651091d441364..d62dc8d86781c 100644 --- a/lib/Target/AMDGPU/GCNILPSched.cpp +++ b/lib/Target/AMDGPU/GCNILPSched.cpp @@ -335,7 +335,7 @@ GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots, assert(C); AvailQueue.remove(*C); auto SU = C->SU; - LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG)); + LLVM_DEBUG(dbgs() << "Selected "; DAG.dumpNode(*SU)); advanceToCycle(SU->getHeight()); diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 15366d66bd852..8e4cc391dc21c 100644 --- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -434,8 +434,7 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule, // Sort recorded regions by pressure - highest at the front void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) { const auto &ST = MF.getSubtarget<GCNSubtarget>(); - llvm::sort(Regions.begin(), Regions.end(), - [&ST, TargetOcc](const Region *R1, const Region *R2) { + llvm::sort(Regions, [&ST, TargetOcc](const Region *R1, const Region *R2) { return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc); }); } diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp index 192d534bb9cfd..ec6bcae335551 100644 --- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp +++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp @@ -258,7 +258,7 @@ 
GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots, assert(C); RQ.remove(*C); auto SU = C->SU; - LLVM_DEBUG(dbgs() << "Selected "; SU->dump(&DAG)); + LLVM_DEBUG(dbgs() << "Selected "; DAG.dumpNode(*SU)); releaseSuccessors(SU, StepNo); Schedule.push_back(SU); diff --git a/lib/Target/AMDGPU/GCNProcessors.td b/lib/Target/AMDGPU/GCNProcessors.td index d76acfa24f901..b8142a4e4ff88 100644 --- a/lib/Target/AMDGPU/GCNProcessors.td +++ b/lib/Target/AMDGPU/GCNProcessors.td @@ -156,3 +156,8 @@ def : ProcessorModel<"gfx904", SIQuarterSpeedModel, def : ProcessorModel<"gfx906", SIQuarterSpeedModel, [FeatureISAVersion9_0_6] >; + +def : ProcessorModel<"gfx909", SIQuarterSpeedModel, + [FeatureISAVersion9_0_9] +>; + diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index db908368a1791..fab0f87dfcbea 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -207,9 +207,12 @@ void AMDGPUInstPrinter::printDA(const MCInst *MI, unsigned OpNo, printNamedBit(MI, OpNo, O, "da"); } -void AMDGPUInstPrinter::printR128(const MCInst *MI, unsigned OpNo, +void AMDGPUInstPrinter::printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - printNamedBit(MI, OpNo, O, "r128"); + if (STI.hasFeature(AMDGPU::FeatureR128A16)) + printNamedBit(MI, OpNo, O, "a16"); + else + printNamedBit(MI, OpNo, O, "r128"); } void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo, @@ -236,21 +239,12 @@ void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo, O << " vm"; } -void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) { - O << " dfmt:"; - printU8ImmDecOperand(MI, OpNo, O); - } -} - -void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - if 
(MI->getOperand(OpNo).getImm()) { - O << " nfmt:"; - printU8ImmDecOperand(MI, OpNo, O); +void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (unsigned Val = MI->getOperand(OpNo).getImm()) { + O << " dfmt:" << (Val & 15); + O << ", nfmt:" << (Val >> 4); } } @@ -1161,8 +1155,7 @@ void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - AMDGPU::IsaInfo::IsaVersion ISA = - AMDGPU::IsaInfo::getIsaVersion(STI.getFeatureBits()); + AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI.getCPU()); unsigned SImm16 = MI->getOperand(OpNo).getImm(); unsigned Vmcnt, Expcnt, Lgkmcnt; diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index 11a496a38b2cd..0ba74ca0f3e19 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -80,7 +80,7 @@ private: raw_ostream &O); void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printR128(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printLWE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); @@ -90,10 +90,8 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printExpVM(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printDFMT(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); - void printNFMT(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O); + void printFORMAT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, 
unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/lib/Target/AMDGPU/LLVMBuild.txt b/lib/Target/AMDGPU/LLVMBuild.txt index c54a13c4b4d88..e591d756a545e 100644 --- a/lib/Target/AMDGPU/LLVMBuild.txt +++ b/lib/Target/AMDGPU/LLVMBuild.txt @@ -30,5 +30,5 @@ has_disassembler = 1 type = Library name = AMDGPUCodeGen parent = AMDGPU -required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize GlobalISel +required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize GlobalISel BinaryFormat add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 07bef9103c0d8..c85a1ea5b0549 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -46,11 +46,9 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, if (const auto *SymA = Target.getSymA()) { // SCRATCH_RSRC_DWORD[01] is a special global variable that represents // the scratch buffer. 
- if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD0") + if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD0" || + SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD1") return ELF::R_AMDGPU_ABS32_LO; - - if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD1") - return ELF::R_AMDGPU_ABS32_HI; } switch (Target.getAccessVariant()) { diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 6a41e3f650bc2..c17fe126546ce 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -17,7 +17,9 @@ #include "Utils/AMDGPUBaseInfo.h" #include "Utils/AMDKernelCodeTUtils.h" #include "llvm/ADT/Twine.h" +#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/BinaryFormat/MsgPackTypes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Metadata.h" @@ -27,6 +29,7 @@ #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/Support/FormattedStream.h" +#include "llvm/Support/TargetParser.h" namespace llvm { #include "AMDGPUPTNote.h" @@ -34,95 +37,116 @@ namespace llvm { using namespace llvm; using namespace llvm::AMDGPU; +using namespace llvm::AMDGPU::HSAMD; //===----------------------------------------------------------------------===// // AMDGPUTargetStreamer //===----------------------------------------------------------------------===// -static const struct { - const char *Name; - unsigned Mach; -} MachTable[] = { - // Radeon HD 2000/3000 Series (R600). - { "r600", ELF::EF_AMDGPU_MACH_R600_R600 }, - { "r630", ELF::EF_AMDGPU_MACH_R600_R630 }, - { "rs880", ELF::EF_AMDGPU_MACH_R600_RS880 }, - { "rv670", ELF::EF_AMDGPU_MACH_R600_RV670 }, - // Radeon HD 4000 Series (R700). 
- { "rv710", ELF::EF_AMDGPU_MACH_R600_RV710 }, - { "rv730", ELF::EF_AMDGPU_MACH_R600_RV730 }, - { "rv770", ELF::EF_AMDGPU_MACH_R600_RV770 }, - // Radeon HD 5000 Series (Evergreen). - { "cedar", ELF::EF_AMDGPU_MACH_R600_CEDAR }, - { "cypress", ELF::EF_AMDGPU_MACH_R600_CYPRESS }, - { "juniper", ELF::EF_AMDGPU_MACH_R600_JUNIPER }, - { "redwood", ELF::EF_AMDGPU_MACH_R600_REDWOOD }, - { "sumo", ELF::EF_AMDGPU_MACH_R600_SUMO }, - // Radeon HD 6000 Series (Northern Islands). - { "barts", ELF::EF_AMDGPU_MACH_R600_BARTS }, - { "caicos", ELF::EF_AMDGPU_MACH_R600_CAICOS }, - { "cayman", ELF::EF_AMDGPU_MACH_R600_CAYMAN }, - { "turks", ELF::EF_AMDGPU_MACH_R600_TURKS }, - // AMDGCN GFX6. - { "gfx600", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 }, - { "tahiti", ELF::EF_AMDGPU_MACH_AMDGCN_GFX600 }, - { "gfx601", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 }, - { "hainan", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 }, - { "oland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 }, - { "pitcairn", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 }, - { "verde", ELF::EF_AMDGPU_MACH_AMDGCN_GFX601 }, - // AMDGCN GFX7. - { "gfx700", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 }, - { "kaveri", ELF::EF_AMDGPU_MACH_AMDGCN_GFX700 }, - { "gfx701", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 }, - { "hawaii", ELF::EF_AMDGPU_MACH_AMDGCN_GFX701 }, - { "gfx702", ELF::EF_AMDGPU_MACH_AMDGCN_GFX702 }, - { "gfx703", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 }, - { "kabini", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 }, - { "mullins", ELF::EF_AMDGPU_MACH_AMDGCN_GFX703 }, - { "gfx704", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 }, - { "bonaire", ELF::EF_AMDGPU_MACH_AMDGCN_GFX704 }, - // AMDGCN GFX8. 
- { "gfx801", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 }, - { "carrizo", ELF::EF_AMDGPU_MACH_AMDGCN_GFX801 }, - { "gfx802", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 }, - { "iceland", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 }, - { "tonga", ELF::EF_AMDGPU_MACH_AMDGCN_GFX802 }, - { "gfx803", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 }, - { "fiji", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 }, - { "polaris10", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 }, - { "polaris11", ELF::EF_AMDGPU_MACH_AMDGCN_GFX803 }, - { "gfx810", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 }, - { "stoney", ELF::EF_AMDGPU_MACH_AMDGCN_GFX810 }, - // AMDGCN GFX9. - { "gfx900", ELF::EF_AMDGPU_MACH_AMDGCN_GFX900 }, - { "gfx902", ELF::EF_AMDGPU_MACH_AMDGCN_GFX902 }, - { "gfx904", ELF::EF_AMDGPU_MACH_AMDGCN_GFX904 }, - { "gfx906", ELF::EF_AMDGPU_MACH_AMDGCN_GFX906 }, - // Not specified processor. - { nullptr, ELF::EF_AMDGPU_MACH_NONE } -}; +bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) { + HSAMD::Metadata HSAMetadata; + if (HSAMD::fromString(HSAMetadataString, HSAMetadata)) + return false; -unsigned AMDGPUTargetStreamer::getMACH(StringRef GPU) const { - auto Entry = MachTable; - for (; Entry->Name && GPU != Entry->Name; ++Entry) - ; - return Entry->Mach; + return EmitHSAMetadata(HSAMetadata); } -const char *AMDGPUTargetStreamer::getMachName(unsigned Mach) { - auto Entry = MachTable; - for (; Entry->Name && Mach != Entry->Mach; ++Entry) - ; - return Entry->Name; +bool AMDGPUTargetStreamer::EmitHSAMetadataV3(StringRef HSAMetadataString) { + std::shared_ptr<msgpack::Node> HSAMetadataRoot; + yaml::Input YIn(HSAMetadataString); + YIn >> HSAMetadataRoot; + if (YIn.error()) + return false; + return EmitHSAMetadata(HSAMetadataRoot, false); } -bool AMDGPUTargetStreamer::EmitHSAMetadata(StringRef HSAMetadataString) { - HSAMD::Metadata HSAMetadata; - if (HSAMD::fromString(HSAMetadataString, HSAMetadata)) - return false; +StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { + AMDGPU::GPUKind AK; - return 
EmitHSAMetadata(HSAMetadata); + switch (ElfMach) { + case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break; + case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break; + case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break; + case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break; + case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break; + case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break; + case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break; + case ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break; + case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break; + case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break; + case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break; + case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break; + case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break; + case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break; + case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break; + case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = 
GK_GFX906; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; + case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; + } + + StringRef GPUName = getArchNameAMDGCN(AK); + if (GPUName != "") + return GPUName; + return getArchNameR600(AK); +} + +unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { + AMDGPU::GPUKind AK = parseArchAMDGCN(GPU); + if (AK == AMDGPU::GPUKind::GK_NONE) + AK = parseArchR600(GPU); + + switch (AK) { + case GK_R600: return ELF::EF_AMDGPU_MACH_R600_R600; + case GK_R630: return ELF::EF_AMDGPU_MACH_R600_R630; + case GK_RS880: return ELF::EF_AMDGPU_MACH_R600_RS880; + case GK_RV670: return ELF::EF_AMDGPU_MACH_R600_RV670; + case GK_RV710: return ELF::EF_AMDGPU_MACH_R600_RV710; + case GK_RV730: return ELF::EF_AMDGPU_MACH_R600_RV730; + case GK_RV770: return ELF::EF_AMDGPU_MACH_R600_RV770; + case GK_CEDAR: return ELF::EF_AMDGPU_MACH_R600_CEDAR; + case GK_CYPRESS: return ELF::EF_AMDGPU_MACH_R600_CYPRESS; + case GK_JUNIPER: return ELF::EF_AMDGPU_MACH_R600_JUNIPER; + case GK_REDWOOD: return ELF::EF_AMDGPU_MACH_R600_REDWOOD; + case GK_SUMO: return ELF::EF_AMDGPU_MACH_R600_SUMO; + case GK_BARTS: return ELF::EF_AMDGPU_MACH_R600_BARTS; + case GK_CAICOS: return ELF::EF_AMDGPU_MACH_R600_CAICOS; + case GK_CAYMAN: return ELF::EF_AMDGPU_MACH_R600_CAYMAN; + case GK_TURKS: return ELF::EF_AMDGPU_MACH_R600_TURKS; + case GK_GFX600: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX600; + case GK_GFX601: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX601; + case GK_GFX700: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX700; + case GK_GFX701: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX701; + case GK_GFX702: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX702; + case GK_GFX703: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX703; + case GK_GFX704: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX704; + case GK_GFX801: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX801; + case GK_GFX802: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX802; + case GK_GFX803: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX803; + case GK_GFX810: return 
ELF::EF_AMDGPU_MACH_AMDGCN_GFX810; + case GK_GFX900: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX900; + case GK_GFX902: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX902; + case GK_GFX904: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX904; + case GK_GFX906: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906; + case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909; + case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE; + } + + llvm_unreachable("unknown GPU"); } //===----------------------------------------------------------------------===// @@ -183,9 +207,26 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( if (HSAMD::toString(HSAMetadata, HSAMetadataString)) return false; - OS << '\t' << HSAMD::AssemblerDirectiveBegin << '\n'; + OS << '\t' << AssemblerDirectiveBegin << '\n'; OS << HSAMetadataString << '\n'; - OS << '\t' << HSAMD::AssemblerDirectiveEnd << '\n'; + OS << '\t' << AssemblerDirectiveEnd << '\n'; + return true; +} + +bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( + std::shared_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) { + V3::MetadataVerifier Verifier(Strict); + if (!Verifier.verify(*HSAMetadataRoot)) + return false; + + std::string HSAMetadataString; + raw_string_ostream StrOS(HSAMetadataString); + yaml::Output YOut(StrOS); + YOut << HSAMetadataRoot; + + OS << '\t' << V3::AssemblerDirectiveBegin << '\n'; + OS << StrOS.str() << '\n'; + OS << '\t' << V3::AssemblerDirectiveEnd << '\n'; return true; } @@ -203,70 +244,59 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, bool ReserveXNACK) { - amdhsa::kernel_descriptor_t DefaultKD = getDefaultAmdhsaKernelDescriptor(); - - IsaInfo::IsaVersion IVersion = IsaInfo::getIsaVersion(STI.getFeatureBits()); + IsaVersion IVersion = getIsaVersion(STI.getCPU()); OS << "\t.amdhsa_kernel " << KernelName << '\n'; -#define PRINT_IF_NOT_DEFAULT(STREAM, DIRECTIVE, KERNEL_DESC, \ - 
DEFAULT_KERNEL_DESC, MEMBER_NAME, FIELD_NAME) \ - if (AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) != \ - AMDHSA_BITS_GET(DEFAULT_KERNEL_DESC.MEMBER_NAME, FIELD_NAME)) \ - STREAM << "\t\t" << DIRECTIVE << " " \ - << AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) << '\n'; +#define PRINT_FIELD(STREAM, DIRECTIVE, KERNEL_DESC, MEMBER_NAME, FIELD_NAME) \ + STREAM << "\t\t" << DIRECTIVE << " " \ + << AMDHSA_BITS_GET(KERNEL_DESC.MEMBER_NAME, FIELD_NAME) << '\n'; - if (KD.group_segment_fixed_size != DefaultKD.group_segment_fixed_size) - OS << "\t\t.amdhsa_group_segment_fixed_size " << KD.group_segment_fixed_size - << '\n'; - if (KD.private_segment_fixed_size != DefaultKD.private_segment_fixed_size) - OS << "\t\t.amdhsa_private_segment_fixed_size " - << KD.private_segment_fixed_size << '\n'; + OS << "\t\t.amdhsa_group_segment_fixed_size " << KD.group_segment_fixed_size + << '\n'; + OS << "\t\t.amdhsa_private_segment_fixed_size " + << KD.private_segment_fixed_size << '\n'; - PRINT_IF_NOT_DEFAULT( - OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, DefaultKD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); - PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD, DefaultKD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR); - PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_queue_ptr", KD, DefaultKD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR); - PRINT_IF_NOT_DEFAULT( - OS, ".amdhsa_user_sgpr_kernarg_segment_ptr", KD, DefaultKD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR); - PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_user_sgpr_dispatch_id", KD, DefaultKD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); - PRINT_IF_NOT_DEFAULT( - OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, DefaultKD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); - 
PRINT_IF_NOT_DEFAULT( - OS, ".amdhsa_user_sgpr_private_segment_size", KD, DefaultKD, - kernel_code_properties, - amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); - PRINT_IF_NOT_DEFAULT( - OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD, DefaultKD, + PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); + PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR); + PRINT_FIELD(OS, ".amdhsa_user_sgpr_queue_ptr", KD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR); + PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_segment_ptr", KD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR); + PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_id", KD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); + PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); + PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD, + kernel_code_properties, + amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE); + PRINT_FIELD( + OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD, compute_pgm_rsrc2, amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET); - PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD, DefaultKD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X); - PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_y", KD, DefaultKD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y); - PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_sgpr_workgroup_id_z", KD, DefaultKD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z); - PRINT_IF_NOT_DEFAULT(OS, 
".amdhsa_system_sgpr_workgroup_info", KD, DefaultKD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO); - PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_system_vgpr_workitem_id", KD, DefaultKD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID); + PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X); + PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_y", KD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y); + PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_z", KD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z); + PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_info", KD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO); + PRINT_FIELD(OS, ".amdhsa_system_vgpr_workitem_id", KD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID); // These directives are required. OS << "\t\t.amdhsa_next_free_vgpr " << NextVGPR << '\n'; @@ -279,54 +309,52 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( if (IVersion.Major >= 8 && ReserveXNACK != hasXNACK(STI)) OS << "\t\t.amdhsa_reserve_xnack_mask " << ReserveXNACK << '\n'; - PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_32", KD, DefaultKD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32); - PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_round_mode_16_64", KD, DefaultKD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64); - PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_32", KD, DefaultKD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32); - PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_float_denorm_mode_16_64", KD, DefaultKD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64); - PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_dx10_clamp", KD, DefaultKD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP); - 
PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_ieee_mode", KD, DefaultKD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE); + PRINT_FIELD(OS, ".amdhsa_float_round_mode_32", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32); + PRINT_FIELD(OS, ".amdhsa_float_round_mode_16_64", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64); + PRINT_FIELD(OS, ".amdhsa_float_denorm_mode_32", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32); + PRINT_FIELD(OS, ".amdhsa_float_denorm_mode_16_64", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64); + PRINT_FIELD(OS, ".amdhsa_dx10_clamp", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP); + PRINT_FIELD(OS, ".amdhsa_ieee_mode", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE); if (IVersion.Major >= 9) - PRINT_IF_NOT_DEFAULT(OS, ".amdhsa_fp16_overflow", KD, DefaultKD, - compute_pgm_rsrc1, - amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL); - PRINT_IF_NOT_DEFAULT( - OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, DefaultKD, + PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL); + PRINT_FIELD( + OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, compute_pgm_rsrc2, amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION); - PRINT_IF_NOT_DEFAULT( - OS, ".amdhsa_exception_fp_denorm_src", KD, DefaultKD, compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE); - PRINT_IF_NOT_DEFAULT( - OS, ".amdhsa_exception_fp_ieee_div_zero", KD, DefaultKD, + PRINT_FIELD(OS, ".amdhsa_exception_fp_denorm_src", KD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE); + PRINT_FIELD( + OS, ".amdhsa_exception_fp_ieee_div_zero", KD, compute_pgm_rsrc2, amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO); - PRINT_IF_NOT_DEFAULT( - OS, ".amdhsa_exception_fp_ieee_overflow", KD, 
DefaultKD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW); - PRINT_IF_NOT_DEFAULT( - OS, ".amdhsa_exception_fp_ieee_underflow", KD, DefaultKD, - compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW); - PRINT_IF_NOT_DEFAULT( - OS, ".amdhsa_exception_fp_ieee_inexact", KD, DefaultKD, compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT); - PRINT_IF_NOT_DEFAULT( - OS, ".amdhsa_exception_int_div_zero", KD, DefaultKD, compute_pgm_rsrc2, - amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO); -#undef PRINT_IF_NOT_DEFAULT + PRINT_FIELD(OS, ".amdhsa_exception_fp_ieee_overflow", KD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW); + PRINT_FIELD(OS, ".amdhsa_exception_fp_ieee_underflow", KD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW); + PRINT_FIELD(OS, ".amdhsa_exception_fp_ieee_inexact", KD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT); + PRINT_FIELD(OS, ".amdhsa_exception_int_div_zero", KD, + compute_pgm_rsrc2, + amdhsa::COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO); +#undef PRINT_FIELD OS << "\t.end_amdhsa_kernel\n"; } @@ -342,12 +370,16 @@ AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer( unsigned EFlags = MCA.getELFHeaderEFlags(); EFlags &= ~ELF::EF_AMDGPU_MACH; - EFlags |= getMACH(STI.getCPU()); + EFlags |= getElfMach(STI.getCPU()); EFlags &= ~ELF::EF_AMDGPU_XNACK; if (AMDGPU::hasXNACK(STI)) EFlags |= ELF::EF_AMDGPU_XNACK; + EFlags &= ~ELF::EF_AMDGPU_SRAM_ECC; + if (AMDGPU::hasSRAMECC(STI)) + EFlags |= ELF::EF_AMDGPU_SRAM_ECC; + MCA.setELFHeaderEFlags(EFlags); } @@ -355,13 +387,13 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() { return static_cast<MCELFStreamer &>(Streamer); } -void AMDGPUTargetELFStreamer::EmitAMDGPUNote( - const MCExpr *DescSZ, unsigned NoteType, +void AMDGPUTargetELFStreamer::EmitNote( + 
StringRef Name, const MCExpr *DescSZ, unsigned NoteType, function_ref<void(MCELFStreamer &)> EmitDesc) { auto &S = getStreamer(); auto &Context = S.getContext(); - auto NameSZ = sizeof(ElfNote::NoteName); + auto NameSZ = Name.size() + 1; S.PushSection(); S.SwitchSection(Context.getELFSection( @@ -369,7 +401,7 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUNote( S.EmitIntValue(NameSZ, 4); // namesz S.EmitValue(DescSZ, 4); // descz S.EmitIntValue(NoteType, 4); // type - S.EmitBytes(StringRef(ElfNote::NoteName, NameSZ)); // name + S.EmitBytes(Name); // name S.EmitValueToAlignment(4, 0, 1, 0); // padding 0 EmitDesc(S); // desc S.EmitValueToAlignment(4, 0, 1, 0); // padding 0 @@ -381,14 +413,11 @@ void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {} void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion( uint32_t Major, uint32_t Minor) { - EmitAMDGPUNote( - MCConstantExpr::create(8, getContext()), - ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, - [&](MCELFStreamer &OS){ - OS.EmitIntValue(Major, 4); - OS.EmitIntValue(Minor, 4); - } - ); + EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(8, getContext()), + ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) { + OS.EmitIntValue(Major, 4); + OS.EmitIntValue(Minor, 4); + }); } void @@ -404,21 +433,18 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major, sizeof(Major) + sizeof(Minor) + sizeof(Stepping) + VendorNameSize + ArchNameSize; - EmitAMDGPUNote( - MCConstantExpr::create(DescSZ, getContext()), - ElfNote::NT_AMDGPU_HSA_ISA, - [&](MCELFStreamer &OS) { - OS.EmitIntValue(VendorNameSize, 2); - OS.EmitIntValue(ArchNameSize, 2); - OS.EmitIntValue(Major, 4); - OS.EmitIntValue(Minor, 4); - OS.EmitIntValue(Stepping, 4); - OS.EmitBytes(VendorName); - OS.EmitIntValue(0, 1); // NULL terminate VendorName - OS.EmitBytes(ArchName); - OS.EmitIntValue(0, 1); // NULL terminte ArchName - } - ); + EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(DescSZ, getContext()), + 
ElfNote::NT_AMDGPU_HSA_ISA, [&](MCELFStreamer &OS) { + OS.EmitIntValue(VendorNameSize, 2); + OS.EmitIntValue(ArchNameSize, 2); + OS.EmitIntValue(Major, 4); + OS.EmitIntValue(Minor, 4); + OS.EmitIntValue(Stepping, 4); + OS.EmitBytes(VendorName); + OS.EmitIntValue(0, 1); // NULL terminate VendorName + OS.EmitBytes(ArchName); + OS.EmitIntValue(0, 1); // NULL terminte ArchName + }); } void @@ -447,15 +473,41 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) { MCSymbolRefExpr::create(DescEnd, Context), MCSymbolRefExpr::create(DescBegin, Context), Context); - EmitAMDGPUNote( - DescSZ, - ELF::NT_AMD_AMDGPU_ISA, - [&](MCELFStreamer &OS) { - OS.EmitLabel(DescBegin); - OS.EmitBytes(IsaVersionString); - OS.EmitLabel(DescEnd); - } - ); + EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_ISA, + [&](MCELFStreamer &OS) { + OS.EmitLabel(DescBegin); + OS.EmitBytes(IsaVersionString); + OS.EmitLabel(DescEnd); + }); + return true; +} + +bool AMDGPUTargetELFStreamer::EmitHSAMetadata( + std::shared_ptr<msgpack::Node> &HSAMetadataRoot, bool Strict) { + V3::MetadataVerifier Verifier(Strict); + if (!Verifier.verify(*HSAMetadataRoot)) + return false; + + std::string HSAMetadataString; + raw_string_ostream StrOS(HSAMetadataString); + msgpack::Writer MPWriter(StrOS); + HSAMetadataRoot->write(MPWriter); + + // Create two labels to mark the beginning and end of the desc field + // and a MCExpr to calculate the size of the desc field. 
+ auto &Context = getContext(); + auto *DescBegin = Context.createTempSymbol(); + auto *DescEnd = Context.createTempSymbol(); + auto *DescSZ = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(DescEnd, Context), + MCSymbolRefExpr::create(DescBegin, Context), Context); + + EmitNote(ElfNote::NoteNameV3, DescSZ, ELF::NT_AMDGPU_METADATA, + [&](MCELFStreamer &OS) { + OS.EmitLabel(DescBegin); + OS.EmitBytes(StrOS.str()); + OS.EmitLabel(DescEnd); + }); return true; } @@ -474,28 +526,24 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata( MCSymbolRefExpr::create(DescEnd, Context), MCSymbolRefExpr::create(DescBegin, Context), Context); - EmitAMDGPUNote( - DescSZ, - ELF::NT_AMD_AMDGPU_HSA_METADATA, - [&](MCELFStreamer &OS) { - OS.EmitLabel(DescBegin); - OS.EmitBytes(HSAMetadataString); - OS.EmitLabel(DescEnd); - } - ); + EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_HSA_METADATA, + [&](MCELFStreamer &OS) { + OS.EmitLabel(DescBegin); + OS.EmitBytes(HSAMetadataString); + OS.EmitLabel(DescEnd); + }); return true; } bool AMDGPUTargetELFStreamer::EmitPALMetadata( const PALMD::Metadata &PALMetadata) { - EmitAMDGPUNote( - MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t), getContext()), - ELF::NT_AMD_AMDGPU_PAL_METADATA, - [&](MCELFStreamer &OS){ - for (auto I : PALMetadata) - OS.EmitIntValue(I, sizeof(uint32_t)); - } - ); + EmitNote(ElfNote::NoteNameV2, + MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t), + getContext()), + ELF::NT_AMD_AMDGPU_PAL_METADATA, [&](MCELFStreamer &OS) { + for (auto I : PALMetadata) + OS.EmitIntValue(I, sizeof(uint32_t)); + }); return true; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 472da1b735936..9a807c804f9ff 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -11,6 +11,7 @@ #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H #include 
"AMDKernelCodeT.h" +#include "llvm/BinaryFormat/MsgPackTypes.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/AMDGPUMetadata.h" @@ -31,13 +32,7 @@ class AMDGPUTargetStreamer : public MCTargetStreamer { protected: MCContext &getContext() const { return Streamer.getContext(); } - /// \returns Equivalent EF_AMDGPU_MACH_* value for given \p GPU name. - unsigned getMACH(StringRef GPU) const; - public: - /// \returns Equivalent GPU name for an EF_AMDGPU_MACH_* value. - static const char *getMachName(unsigned Mach); - AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0; @@ -58,7 +53,20 @@ public: virtual bool EmitISAVersion(StringRef IsaVersionString) = 0; /// \returns True on success, false on failure. - virtual bool EmitHSAMetadata(StringRef HSAMetadataString); + virtual bool EmitHSAMetadataV2(StringRef HSAMetadataString); + + /// \returns True on success, false on failure. + virtual bool EmitHSAMetadataV3(StringRef HSAMetadataString); + + /// Emit HSA Metadata + /// + /// When \p Strict is true, known metadata elements must already be + /// well-typed. When \p Strict is false, known types are inferred and + /// the \p HSAMetadata structure is updated with the correct types. + /// + /// \returns True on success, false on failure. + virtual bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata, + bool Strict) = 0; /// \returns True on success, false on failure. 
virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0; @@ -71,6 +79,9 @@ public: const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, bool ReserveXNACK) = 0; + + static StringRef getArchNameFromElfMach(unsigned ElfMach); + static unsigned getElfMach(StringRef GPU); }; class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer { @@ -95,6 +106,10 @@ public: bool EmitISAVersion(StringRef IsaVersionString) override; /// \returns True on success, false on failure. + bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata, + bool Strict) override; + + /// \returns True on success, false on failure. bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; /// \returns True on success, false on failure. @@ -110,8 +125,8 @@ public: class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { MCStreamer &Streamer; - void EmitAMDGPUNote(const MCExpr *DescSize, unsigned NoteType, - function_ref<void(MCELFStreamer &)> EmitDesc); + void EmitNote(StringRef Name, const MCExpr *DescSize, unsigned NoteType, + function_ref<void(MCELFStreamer &)> EmitDesc); public: AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI); @@ -135,6 +150,10 @@ public: bool EmitISAVersion(StringRef IsaVersionString) override; /// \returns True on success, false on failure. + bool EmitHSAMetadata(std::shared_ptr<msgpack::Node> &HSAMetadata, + bool Strict) override; + + /// \returns True on success, false on failure. bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; /// \returns True on success, false on failure. 
diff --git a/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt index 773ee7c0a4ba7..bc910a470d72a 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/AMDGPU/MCTargetDesc/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = AMDGPUDesc parent = AMDGPU -required_libraries = Core MC AMDGPUAsmPrinter AMDGPUInfo AMDGPUUtils Support +required_libraries = Core MC AMDGPUAsmPrinter AMDGPUInfo AMDGPUUtils Support BinaryFormat add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td index 44c2d366e4613..1c68dbd78e758 100644 --- a/lib/Target/AMDGPU/MIMGInstructions.td +++ b/lib/Target/AMDGPU/MIMGInstructions.td @@ -29,6 +29,7 @@ class MIMGBaseOpcode { bit Atomic = 0; bit AtomicX2 = 0; // (f)cmpswap bit Sampler = 0; + bit Gather4 = 0; bits<8> NumExtraArgs = 0; bit Gradients = 0; bit Coordinates = 1; @@ -43,7 +44,7 @@ def MIMGBaseOpcode : GenericEnum { def MIMGBaseOpcodesTable : GenericTable { let FilterClass = "MIMGBaseOpcode"; let CppTypeName = "MIMGBaseOpcodeInfo"; - let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", + let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4", "NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip", "HasD16"]; GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode; @@ -141,7 +142,7 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm, let InOperandList = !con((ins addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, - R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da), + R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" #!if(BaseOpcode.HasD16, "$d16", ""); @@ -179,6 +180,8 @@ multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0, defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>; let VDataDwords = 4 in 
defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>; + let VDataDwords = 8 in + defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>; } } @@ -199,7 +202,7 @@ class MIMG_Store_Helper <bits<7> op, string asm, let InOperandList = !con((ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, - R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da), + R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); let AsmString = asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" #!if(BaseOpcode.HasD16, "$d16", ""); @@ -252,7 +255,7 @@ class MIMG_Atomic_Helper <string asm, RegisterClass data_rc, let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, - R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da); + R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da); let AsmString = asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da"; } @@ -316,7 +319,7 @@ class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc, let InOperandList = !con((ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask, UNorm:$unorm, GLC:$glc, SLC:$slc, - R128:$r128, TFE:$tfe, LWE:$lwe, DA:$da), + R128A16:$r128, TFE:$tfe, LWE:$lwe, DA:$da), !if(BaseOpcode.HasD16, (ins D16:$d16), (ins))); let AsmString = asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da" #!if(BaseOpcode.HasD16, "$d16", ""); @@ -411,6 +414,8 @@ multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>; let VDataDwords = 4 in defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>; + let VDataDwords = 8 in + defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>; } } @@ -421,6 +426,7 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, string asm = "image_gather4"#sample.LowerCaseMod> { def "" : MIMG_Sampler_BaseOpcode<sample> { 
let HasD16 = 1; + let Gather4 = 1; } let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm, @@ -429,6 +435,8 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0, defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */ let VDataDwords = 4 in defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>; + let VDataDwords = 8 in + defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>; } } diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index 1683fe6c9a571..679cf18d2c20b 100644 --- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -226,11 +226,11 @@ private: // occur in the same basic block as its definition, because // it is illegal for the scheduler to schedule them in // different blocks. - if (UseI->readsRegister(MOI->getReg())) + if (UseI->readsRegister(MOI->getReg(), &TRI)) LastUseCount = AluInstCount; // Exit early if the current use kills the register - if (UseI != Def && UseI->killsRegister(MOI->getReg())) + if (UseI != Def && UseI->killsRegister(MOI->getReg(), &TRI)) break; } if (LastUseCount) diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 113d6249fa60a..e2a0f05d2b34d 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -589,7 +589,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const } case Intrinsic::r600_implicitarg_ptr: { - MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS); + MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS); uint32_t ByteOffset = getImplicitParameterOffset(MF, FIRST_IMPLICIT); return DAG.getConstant(ByteOffset, DL, PtrVT); } @@ -741,12 +741,12 @@ SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = 
cast<GlobalAddressSDNode>(Op); - if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS) + if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); const DataLayout &DL = DAG.getDataLayout(); const GlobalValue *GV = GSD->getGlobal(); - MVT ConstPtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS); + MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT); return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA); @@ -903,7 +903,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, unsigned DwordOffset) const { unsigned ByteOffset = DwordOffset * 4; PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUASI.CONSTANT_BUFFER_0); + AMDGPUAS::PARAM_I_ADDRESS); // We shouldn't be using an offset wider than 16-bits for implicit parameters. assert(isInt<16>(ByteOffset)); @@ -1141,7 +1141,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, //TODO: Who creates the i8 stores? assert(Store->isTruncatingStore() || Store->getValue().getValueType() == MVT::i8); - assert(Store->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS); + assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS); SDValue Mask; if (Store->getMemoryVT() == MVT::i8) { @@ -1175,7 +1175,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, // Load dword // TODO: can we be smarter about machine pointer info? 
MachinePointerInfo PtrInfo(UndefValue::get( - Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))); + Type::getInt32PtrTy(*DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))); SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo); Chain = Dst.getValue(1); @@ -1241,9 +1241,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); // Neither LOCAL nor PRIVATE can do vectors at the moment - if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS) && + if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) && VT.isVector()) { - if ((AS == AMDGPUASI.PRIVATE_ADDRESS) && + if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && StoreNode->isTruncatingStore()) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); @@ -1267,7 +1267,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr, DAG.getConstant(2, DL, PtrVT)); - if (AS == AMDGPUASI.GLOBAL_ADDRESS) { + if (AS == AMDGPUAS::GLOBAL_ADDRESS) { // It is beneficial to create MSKOR here instead of combiner to avoid // artificial dependencies introduced by RMW if (StoreNode->isTruncatingStore()) { @@ -1320,7 +1320,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes - if (AS != AMDGPUASI.PRIVATE_ADDRESS) + if (AS != AMDGPUAS::PRIVATE_ADDRESS) return SDValue(); if (MemVT.bitsLT(MVT::i32)) @@ -1403,7 +1403,7 @@ SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op, // Load dword // TODO: can we be smarter about machine pointer info? 
MachinePointerInfo PtrInfo(UndefValue::get( - Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))); + Type::getInt32PtrTy(*DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))); SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo); // Get offset within the register. @@ -1441,7 +1441,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { EVT MemVT = LoadNode->getMemoryVT(); ISD::LoadExtType ExtType = LoadNode->getExtensionType(); - if (AS == AMDGPUASI.PRIVATE_ADDRESS && + if (AS == AMDGPUAS::PRIVATE_ADDRESS && ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) { return lowerPrivateExtLoad(Op, DAG); } @@ -1451,45 +1451,29 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = LoadNode->getChain(); SDValue Ptr = LoadNode->getBasePtr(); - if ((LoadNode->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS || - LoadNode->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS) && + if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && VT.isVector()) { return scalarizeVectorLoad(LoadNode, DAG); } + // This is still used for explicit load from addrspace(8) int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); if (ConstantBlock > -1 && ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { SDValue Result; - if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) || - isa<Constant>(LoadNode->getMemOperand()->getValue()) || + if (isa<Constant>(LoadNode->getMemOperand()->getValue()) || isa<ConstantSDNode>(Ptr)) { - SDValue Slots[4]; - for (unsigned i = 0; i < 4; i++) { - // We want Const position encoded with the following formula : - // (((512 + (kc_bank << 12) + const_index) << 2) + chan) - // const_index is Ptr computed by llvm using an alignment of 16. 
- // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and - // then div by 4 at the ISel step - SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, - DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); - Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); - } - EVT NewVT = MVT::v4i32; - unsigned NumElements = 4; - if (VT.isVector()) { - NewVT = VT; - NumElements = VT.getVectorNumElements(); - } - Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements)); + return constBufferLoad(LoadNode, LoadNode->getAddressSpace(), DAG); } else { + //TODO: Does this even work? // non-constant ptr can't be folded, keeps it as a v4f32 load Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, DL, MVT::i32)), DAG.getConstant(LoadNode->getAddressSpace() - - AMDGPUASI.CONSTANT_BUFFER_0, DL, MVT::i32) + AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32) ); } @@ -1525,7 +1509,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(MergedValues, DL); } - if (LoadNode->getAddressSpace() != AMDGPUASI.PRIVATE_ADDRESS) { + if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { return SDValue(); } @@ -1622,7 +1606,7 @@ SDValue R600TargetLowering::LowerFormalArguments( } PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), - AMDGPUASI.CONSTANT_BUFFER_0); + AMDGPUAS::PARAM_I_ADDRESS); // i64 isn't a legal type, so the register type used ends up as i32, which // isn't expected here. 
It attempts to create this sextload, but it ends up @@ -1646,17 +1630,17 @@ SDValue R600TargetLowering::LowerFormalArguments( unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); unsigned PartOffset = VA.getLocMemOffset(); + unsigned Alignment = MinAlign(VT.getStoreSize(), PartOffset); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); SDValue Arg = DAG.getLoad( ISD::UNINDEXED, Ext, VT, DL, Chain, DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo, - MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal | + MemVT, Alignment, MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); - // 4 is the preferred alignment for the CONSTANT memory space. InVals.push_back(Arg); } return Chain; @@ -1672,7 +1656,7 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, const SelectionDAG &DAG) const { // Local and Private addresses do not handle vectors. 
Limit to i32 - if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) { + if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS)) { return (MemVT.getSizeInBits() <= 32); } return true; @@ -1701,14 +1685,15 @@ bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, static SDValue CompactSwizzlableVector( SelectionDAG &DAG, SDValue VectorEntry, DenseMap<unsigned, unsigned> &RemapSwizzle) { - assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); assert(RemapSwizzle.empty()); - SDValue NewBldVec[4] = { - VectorEntry.getOperand(0), - VectorEntry.getOperand(1), - VectorEntry.getOperand(2), - VectorEntry.getOperand(3) - }; + + SDLoc DL(VectorEntry); + EVT EltTy = VectorEntry.getValueType().getVectorElementType(); + + SDValue NewBldVec[4]; + for (unsigned i = 0; i < 4; i++) + NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry, + DAG.getIntPtrConstant(i, DL)); for (unsigned i = 0; i < 4; i++) { if (NewBldVec[i].isUndef()) @@ -1743,15 +1728,17 @@ static SDValue CompactSwizzlableVector( static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, DenseMap<unsigned, unsigned> &RemapSwizzle) { - assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR); assert(RemapSwizzle.empty()); - SDValue NewBldVec[4] = { - VectorEntry.getOperand(0), - VectorEntry.getOperand(1), - VectorEntry.getOperand(2), - VectorEntry.getOperand(3) - }; - bool isUnmovable[4] = { false, false, false, false }; + + SDLoc DL(VectorEntry); + EVT EltTy = VectorEntry.getValueType().getVectorElementType(); + + SDValue NewBldVec[4]; + bool isUnmovable[4] = {false, false, false, false}; + for (unsigned i = 0; i < 4; i++) + NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry, + DAG.getIntPtrConstant(i, DL)); + for (unsigned i = 0; i < 4; i++) { RemapSwizzle[i] = i; if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) { @@ -1782,7 +1769,6 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, SDValue 
R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4], SelectionDAG &DAG, const SDLoc &DL) const { - assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR); // Old -> New swizzle values DenseMap<unsigned, unsigned> SwizzleRemap; @@ -1804,6 +1790,52 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4], return BuildVector; } +SDValue R600TargetLowering::constBufferLoad(LoadSDNode *LoadNode, int Block, + SelectionDAG &DAG) const { + SDLoc DL(LoadNode); + EVT VT = LoadNode->getValueType(0); + SDValue Chain = LoadNode->getChain(); + SDValue Ptr = LoadNode->getBasePtr(); + assert (isa<ConstantSDNode>(Ptr)); + + //TODO: Support smaller loads + if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 || !ISD::isNON_EXTLoad(LoadNode)) + return SDValue(); + + if (LoadNode->getAlignment() < 4) + return SDValue(); + + int ConstantBlock = ConstantAddressBlock(Block); + + SDValue Slots[4]; + for (unsigned i = 0; i < 4; i++) { + // We want Const position encoded with the following formula : + // (((512 + (kc_bank << 12) + const_index) << 2) + chan) + // const_index is Ptr computed by llvm using an alignment of 16. 
+ // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and + // then div by 4 at the ISel step + SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, + DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32)); + Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr); + } + EVT NewVT = MVT::v4i32; + unsigned NumElements = 4; + if (VT.isVector()) { + NewVT = VT; + NumElements = VT.getVectorNumElements(); + } + SDValue Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements)); + if (!VT.isVector()) { + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result, + DAG.getConstant(0, DL, MVT::i32)); + } + SDValue MergedValues[2] = { + Result, + Chain + }; + return DAG.getMergeValues(MergedValues, DL); +} + //===----------------------------------------------------------------------===// // Custom DAG Optimizations //===----------------------------------------------------------------------===// @@ -2022,6 +2054,16 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL); return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs); } + + case ISD::LOAD: { + LoadSDNode *LoadNode = cast<LoadSDNode>(N); + SDValue Ptr = LoadNode->getBasePtr(); + if (LoadNode->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS && + isa<ConstantSDNode>(Ptr)) + return constBufferLoad(LoadNode, AMDGPUAS::CONSTANT_BUFFER_0, DAG); + break; + } + default: break; } diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index 907d1f10e1519..767c3c7bd5bfe 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -98,9 +98,11 @@ private: bool isHWTrueValue(SDValue Op) const; bool isHWFalseValue(SDValue Op) const; - bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, - SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm, - SelectionDAG &DAG) const; + bool FoldOperand(SDNode 
*ParentNode, unsigned SrcIdx, SDValue &Src, + SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm, + SelectionDAG &DAG) const; + SDValue constBufferLoad(LoadSDNode *LoadNode, int Block, + SelectionDAG &DAG) const; SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; }; diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp index 5397e779474c8..9cc3e5f3c314c 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -229,11 +229,11 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { } bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const { - return MI.findRegisterUseOperandIdx(R600::AR_X) != -1; + return MI.findRegisterUseOperandIdx(R600::AR_X, false, &RI) != -1; } bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const { - return MI.findRegisterDefOperandIdx(R600::AR_X) != -1; + return MI.findRegisterDefOperandIdx(R600::AR_X, false, false, &RI) != -1; } bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const { @@ -1500,19 +1500,19 @@ void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand, } unsigned R600InstrInfo::getAddressSpaceForPseudoSourceKind( - PseudoSourceValue::PSVKind Kind) const { + unsigned Kind) const { switch (Kind) { case PseudoSourceValue::Stack: case PseudoSourceValue::FixedStack: - return ST.getAMDGPUAS().PRIVATE_ADDRESS; + return AMDGPUAS::PRIVATE_ADDRESS; case PseudoSourceValue::ConstantPool: case PseudoSourceValue::GOT: case PseudoSourceValue::JumpTable: case PseudoSourceValue::GlobalValueCallEntry: case PseudoSourceValue::ExternalSymbolCallEntry: case PseudoSourceValue::TargetCustom: - return ST.getAMDGPUAS().CONSTANT_ADDRESS; + return AMDGPUAS::CONSTANT_ADDRESS; } + llvm_unreachable("Invalid pseudo source kind"); - return ST.getAMDGPUAS().PRIVATE_ADDRESS; } diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h index 7a3dece316650..e6e34dc125f4d 100644 --- 
a/lib/Target/AMDGPU/R600InstrInfo.h +++ b/lib/Target/AMDGPU/R600InstrInfo.h @@ -324,7 +324,7 @@ public: } unsigned getAddressSpaceForPseudoSourceKind( - PseudoSourceValue::PSVKind Kind) const override; + unsigned Kind) const override; }; namespace R600 { diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td index 7bf174f4cd864..10e8737552224 100644 --- a/lib/Target/AMDGPU/R600Instructions.td +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -299,7 +299,7 @@ class VTX_READ <string name, dag outs, list<dag> pattern> class LoadParamFrag <PatFrag load_type> : PatFrag < (ops node:$ptr), (load_type node:$ptr), [{ return isConstantLoad(cast<LoadSDNode>(N), 0) || - (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUASI.PARAM_I_ADDRESS); }] + (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }] >; def vtx_id3_az_extloadi8 : LoadParamFrag<az_extloadi8>; @@ -309,8 +309,8 @@ def vtx_id3_load : LoadParamFrag<load>; class LoadVtxId1 <PatFrag load> : PatFrag < (ops node:$ptr), (load node:$ptr), [{ const MemSDNode *LD = cast<MemSDNode>(N); - return LD->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || - (LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && + return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + (LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && !isa<GlobalValue>(GetUnderlyingObject( LD->getMemOperand()->getValue(), CurDAG->getDataLayout()))); }]>; @@ -322,7 +322,7 @@ def vtx_id1_load : LoadVtxId1 <load>; class LoadVtxId2 <PatFrag load> : PatFrag < (ops node:$ptr), (load node:$ptr), [{ const MemSDNode *LD = cast<MemSDNode>(N); - return LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && + return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && isa<GlobalValue>(GetUnderlyingObject( LD->getMemOperand()->getValue(), CurDAG->getDataLayout())); }]>; diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp index a1429a2ac50f1..7769a35aadcee 100644 --- 
a/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -127,13 +127,13 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { LLVM_DEBUG(if (SU) { dbgs() << " ** Pick node **\n"; - SU->dump(DAG); + DAG->dumpNode(*SU); } else { dbgs() << "NO NODE \n"; for (unsigned i = 0; i < DAG->SUnits.size(); i++) { const SUnit &S = DAG->SUnits[i]; if (!S.isScheduled) - S.dump(DAG); + DAG->dumpNode(S); } }); @@ -188,11 +188,11 @@ isPhysicalRegCopy(MachineInstr *MI) { } void R600SchedStrategy::releaseTopNode(SUnit *SU) { - LLVM_DEBUG(dbgs() << "Top Releasing "; SU->dump(DAG);); + LLVM_DEBUG(dbgs() << "Top Releasing "; DAG->dumpNode(*SU)); } void R600SchedStrategy::releaseBottomNode(SUnit *SU) { - LLVM_DEBUG(dbgs() << "Bottom Releasing "; SU->dump(DAG);); + LLVM_DEBUG(dbgs() << "Bottom Releasing "; DAG->dumpNode(*SU)); if (isPhysicalRegCopy(SU->getInstr())) { PhysicalRegCopy.push_back(SU); return; @@ -236,6 +236,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { // MI will become a KILL, don't considers it in scheduling return AluDiscarded; } + break; default: break; } diff --git a/lib/Target/AMDGPU/SIAddIMGInit.cpp b/lib/Target/AMDGPU/SIAddIMGInit.cpp new file mode 100644 index 0000000000000..69cafef4a3513 --- /dev/null +++ b/lib/Target/AMDGPU/SIAddIMGInit.cpp @@ -0,0 +1,181 @@ +//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// Any MIMG instructions that use tfe or lwe require an initialization of the +/// result register that will be written in the case of a memory access failure +/// The required code is also added to tie this init code to the result of the +/// img instruction +/// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +#define DEBUG_TYPE "si-img-init" + +using namespace llvm; + +namespace { + +class SIAddIMGInit : public MachineFunctionPass { +public: + static char ID; + +public: + SIAddIMGInit() : MachineFunctionPass(ID) { + initializeSIAddIMGInitPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. 
+ +INITIALIZE_PASS(SIAddIMGInit, DEBUG_TYPE, "SI Add IMG Init", false, false) + +char SIAddIMGInit::ID = 0; + +char &llvm::SIAddIMGInitID = SIAddIMGInit::ID; + +FunctionPass *llvm::createSIAddIMGInitPass() { return new SIAddIMGInit(); } + +bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *RI = ST.getRegisterInfo(); + bool Changed = false; + + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; + ++BI) { + MachineBasicBlock &MBB = *BI; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + + auto Opcode = MI.getOpcode(); + if (TII->isMIMG(Opcode) && !MI.mayStore()) { + MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe); + MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe); + MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16); + + // Check for instructions that don't have tfe or lwe fields + // There shouldn't be any at this point. + assert( (TFE && LWE) && "Expected tfe and lwe operands in instruction"); + + unsigned TFEVal = TFE->getImm(); + unsigned LWEVal = LWE->getImm(); + unsigned D16Val = D16 ? D16->getImm() : 0; + + if (TFEVal || LWEVal) { + // At least one of TFE or LWE are non-zero + // We have to insert a suitable initialization of the result value and + // tie this to the dest of the image instruction. + + const DebugLoc &DL = MI.getDebugLoc(); + + int DstIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); + + // Calculate which dword we have to initialize to 0. + MachineOperand *MO_Dmask = + TII->getNamedOperand(MI, AMDGPU::OpName::dmask); + + // check that dmask operand is found. 
+ assert(MO_Dmask && "Expected dmask operand in instruction"); + + unsigned dmask = MO_Dmask->getImm(); + // Determine the number of active lanes taking into account the + // Gather4 special case + unsigned ActiveLanes = + TII->isGather4(Opcode) ? 4 : countPopulation(dmask); + + // Subreg indices are counted from 1 + // When D16 then we want next whole VGPR after write data. + static_assert(AMDGPU::sub0 == 1 && AMDGPU::sub4 == 5, "Subreg indices different from expected"); + + bool Packed = !ST.hasUnpackedD16VMem(); + + unsigned InitIdx = + D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1; + + // Abandon attempt if the dst size isn't large enough + // - this is in fact an error but this is picked up elsewhere and + // reported correctly. + uint32_t DstSize = + RI->getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + if (DstSize < InitIdx) + continue; + + // Create a register for the intialization value. + unsigned PrevDst = + MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); + unsigned NewDst = 0; // Final initialized value will be in here + + // If PRTStrictNull feature is enabled (the default) then initialize + // all the result registers to 0, otherwise just the error indication + // register (VGPRn+1) + unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1; + unsigned CurrIdx = ST.usePRTStrictNull() ? 
1 : InitIdx; + + if (DstSize == 1) { + // In this case we can just initialize the result directly + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), PrevDst) + .addImm(0); + NewDst = PrevDst; + } else { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst); + for (; SizeLeft; SizeLeft--, CurrIdx++) { + NewDst = + MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); + // Initialize dword + unsigned SubReg = + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg) + .addImm(0); + // Insert into the super-reg + BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst) + .addReg(PrevDst) + .addReg(SubReg) + .addImm(CurrIdx); + + PrevDst = NewDst; + } + } + + // Add as an implicit operand + MachineInstrBuilder(MF, MI).addReg(NewDst, RegState::Implicit); + + // Tie the just added implicit operand to the dst + MI.tieOperands(DstIdx, MI.getNumOperands() - 1); + + Changed = true; + } + } + } + } + + return Changed; +} diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 74f1bd8fb9866..98e9ea662324f 100644 --- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -16,7 +16,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" @@ -52,7 +52,7 @@ using StackEntry = std::pair<BasicBlock *, Value *>; using StackVector = SmallVector<StackEntry, 16>; class SIAnnotateControlFlow : public FunctionPass { - DivergenceAnalysis *DA; + LegacyDivergenceAnalysis *DA; Type *Boolean; Type *Void; @@ -66,9 +66,7 @@ class SIAnnotateControlFlow : public FunctionPass { Function *If; Function *Else; - Function *Break; Function *IfBreak; - Function 
*ElseBreak; Function *Loop; Function *EndCf; @@ -95,8 +93,7 @@ class SIAnnotateControlFlow : public FunctionPass { Value * handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L, - BranchInst *Term, - SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions); + BranchInst *Term); void handleLoop(BranchInst *Term); @@ -116,7 +113,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<DivergenceAnalysis>(); + AU.addRequired<LegacyDivergenceAnalysis>(); AU.addPreserved<DominatorTreeWrapperPass>(); FunctionPass::getAnalysisUsage(AU); } @@ -127,7 +124,7 @@ public: INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE, "Annotate SI Control Flow", false, false) @@ -149,9 +146,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if); Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else); - Break = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_break); IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break); - ElseBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else_break); Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop); EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf); return false; @@ -160,7 +155,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { /// Is the branch condition uniform or did the StructurizeCFG pass /// consider it as such? 
bool SIAnnotateControlFlow::isUniform(BranchInst *T) { - return DA->isUniform(T->getCondition()) || + return DA->isUniform(T) || T->getMetadata("structurizecfg.uniform") != nullptr; } @@ -227,76 +222,7 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) { /// Recursively handle the condition leading to a loop Value *SIAnnotateControlFlow::handleLoopCondition( - Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term, - SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) { - // Only search through PHI nodes which are inside the loop. If we try this - // with PHI nodes that are outside of the loop, we end up inserting new PHI - // nodes outside of the loop which depend on values defined inside the loop. - // This will break the module with - // 'Instruction does not dominate all users!' errors. - PHINode *Phi = nullptr; - if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) { - BasicBlock *Parent = Phi->getParent(); - PHINode *NewPhi = PHINode::Create(Int64, 0, "loop.phi", &Parent->front()); - Value *Ret = NewPhi; - - // Handle all non-constant incoming values first - for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = Phi->getIncomingValue(i); - BasicBlock *From = Phi->getIncomingBlock(i); - if (isa<ConstantInt>(Incoming)) { - NewPhi->addIncoming(Broken, From); - continue; - } - - Phi->setIncomingValue(i, BoolFalse); - Value *PhiArg = handleLoopCondition(Incoming, Broken, L, - Term, LoopPhiConditions); - NewPhi->addIncoming(PhiArg, From); - } - - BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); - - for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = Phi->getIncomingValue(i); - if (Incoming != BoolTrue) - continue; - - BasicBlock *From = Phi->getIncomingBlock(i); - if (From == IDom) { - // We're in the following situation: - // IDom/From - // | \ - // | If-block - // | / - // Parent - // where we want to break out of the loop if the If-block is not taken. 
- // Due to the depth-first traversal, there should be an end.cf - // intrinsic in Parent, and we insert an else.break before it. - // - // Note that the end.cf need not be the first non-phi instruction - // of parent, particularly when we're dealing with a multi-level - // break, but it should occur within a group of intrinsic calls - // at the beginning of the block. - CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt()); - while (OldEnd && OldEnd->getCalledFunction() != EndCf) - OldEnd = dyn_cast<CallInst>(OldEnd->getNextNode()); - if (OldEnd && OldEnd->getCalledFunction() == EndCf) { - Value *Args[] = { OldEnd->getArgOperand(0), NewPhi }; - Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); - continue; - } - } - - TerminatorInst *Insert = From->getTerminator(); - Value *PhiArg = CallInst::Create(Break, Broken, "", Insert); - NewPhi->setIncomingValue(i, PhiArg); - } - - LoopPhiConditions.push_back(WeakTrackingVH(Phi)); - return Ret; - } - + Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term) { if (Instruction *Inst = dyn_cast<Instruction>(Cond)) { BasicBlock *Parent = Inst->getParent(); Instruction *Insert; @@ -335,21 +261,15 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { BasicBlock *Target = Term->getSuccessor(1); PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front()); - SmallVector<WeakTrackingVH, 8> LoopPhiConditions; Value *Cond = Term->getCondition(); Term->setCondition(BoolTrue); - Value *Arg = handleLoopCondition(Cond, Broken, L, Term, LoopPhiConditions); + Value *Arg = handleLoopCondition(Cond, Broken, L, Term); for (BasicBlock *Pred : predecessors(Target)) Broken->addIncoming(Pred == BB ? 
Arg : Int64Zero, Pred); Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); - for (WeakTrackingVH Val : llvm::reverse(LoopPhiConditions)) { - if (PHINode *Cond = cast_or_null<PHINode>(Val)) - eraseIfUnused(Cond); - } - push(Term->getSuccessor(0), Arg); } @@ -372,7 +292,8 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { Preds.push_back(Pred); } - BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false); + BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr, + false); } Value *Exec = popSaved(); @@ -386,7 +307,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { bool SIAnnotateControlFlow::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - DA = &getAnalysis<DivergenceAnalysis>(); + DA = &getAnalysis<LegacyDivergenceAnalysis>(); for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()), E = df_end(&F.getEntryBlock()); I != E; ++I) { diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index a6d28d6999e5f..7f6abc34cff3a 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -88,7 +88,10 @@ enum : uint64_t { IsPacked = UINT64_C(1) << 49, // Is a D16 buffer instruction. - D16Buf = UINT64_C(1) << 50 + D16Buf = UINT64_C(1) << 50, + + // Uses floating point double precision rounding mode + FPDPRounding = UINT64_C(1) << 51 }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. 
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 566e0d3febc78..809f5bab46932 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -183,13 +183,15 @@ getCopyRegClasses(const MachineInstr &Copy, static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, const TargetRegisterClass *DstRC, const SIRegisterInfo &TRI) { - return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC); + return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) && + TRI.hasVGPRs(SrcRC); } static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, const TargetRegisterClass *DstRC, const SIRegisterInfo &TRI) { - return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC); + return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) && + TRI.hasVGPRs(DstRC); } static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI, @@ -327,9 +329,7 @@ static bool phiHasBreakDef(const MachineInstr &PHI, switch (DefInstr->getOpcode()) { default: break; - case AMDGPU::SI_BREAK: case AMDGPU::SI_IF_BREAK: - case AMDGPU::SI_ELSE_BREAK: return true; case AMDGPU::PHI: if (phiHasBreakDef(*DefInstr, MRI, Visited)) @@ -599,7 +599,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { unsigned SrcReg = MI.getOperand(1).getReg(); if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) { - TII->moveToVALU(MI); + TII->moveToVALU(MI, MDT); break; } @@ -614,7 +614,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { MI.setDesc(TII->get(SMovOp)); break; } - TII->moveToVALU(MI); + TII->moveToVALU(MI, MDT); } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) { tryChangeVGPRtoSGPRinCopy(MI, TRI, TII); } @@ -677,7 +677,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { SmallSet<unsigned, 8> Visited; if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) { LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI); - TII->moveToVALU(MI); + 
TII->moveToVALU(MI, MDT); } break; } @@ -690,7 +690,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); - TII->moveToVALU(MI); + TII->moveToVALU(MI, MDT); break; case AMDGPU::INSERT_SUBREG: { const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; @@ -700,7 +700,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { if (TRI->isSGPRClass(DstRC) && (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) { LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); - TII->moveToVALU(MI); + TII->moveToVALU(MI, MDT); } break; } diff --git a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp index 5d613d8874fab..7761418c53364 100644 --- a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp +++ b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp @@ -10,7 +10,7 @@ /// \file /// Computations in WWM can overwrite values in inactive channels for /// variables that the register allocator thinks are dead. This pass adds fake -/// uses of those variables to WWM instructions to make sure that they aren't +/// uses of those variables to their def(s) to make sure that they aren't /// overwritten. /// /// As an example, consider this snippet: @@ -29,25 +29,44 @@ /// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled, /// it would clobber even the inactive channels for which the if-condition is /// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use -/// of %vgpr0 to the WWM instruction to make sure they aren't allocated to the +/// of %vgpr0 to its def to make sure they aren't allocated to the /// same register. /// /// In general, we need to figure out what registers might have their inactive /// channels which are eventually used accidentally clobbered by a WWM -/// instruction. We approximate this using two conditions: +/// instruction. We do that by spotting three separate cases of registers: /// -/// 1. 
A definition of the variable reaches the WWM instruction. -/// 2. The variable would be live at the WWM instruction if all its defs were -/// partial defs (i.e. considered as a use), ignoring normal uses. +/// 1. A "then phi": the value resulting from phi elimination of a phi node at +/// the end of an if..endif. If there is WWM code in the "then", then we +/// make the def at the end of the "then" branch a partial def by adding an +/// implicit use of the register. /// -/// If a register matches both conditions, then we add an implicit use of it to -/// the WWM instruction. Condition #2 is the heart of the matter: every -/// definition is really a partial definition, since every VALU instruction is -/// implicitly predicated. We can usually ignore this, but WWM forces us not -/// to. Condition #1 prevents false positives if the variable is undefined at -/// the WWM instruction anyways. This is overly conservative in certain cases, -/// especially in uniform control flow, but this is a workaround anyways until -/// LLVM gains the notion of predicated uses and definitions of variables. +/// 2. A "loop exit register": a value written inside a loop but used outside the +/// loop, where there is WWM code inside the loop (the case in the example +/// above). We add an implicit_def of the register in the loop pre-header, +/// and make the original def a partial def by adding an implicit use of the +/// register. +/// +/// 3. A "loop exit phi": the value resulting from phi elimination of a phi node +/// in a loop header. If there is WWM code inside the loop, then we make all +/// defs inside the loop partial defs by adding an implicit use of the +/// register on each one. +/// +/// Note that we do not need to consider an if..else..endif phi. We only need to +/// consider non-uniform control flow, and control flow structurization would +/// have transformed a non-uniform if..else..endif into two if..endifs. 
+/// +/// The analysis to detect these cases relies on a property of the MIR +/// arising from this pass running straight after PHIElimination and before any +/// coalescing: that any virtual register with more than one definition must be +/// the new register added to lower a phi node by PHIElimination. +/// +/// FIXME: We should detect whether a register in one of the above categories is +/// already live at the WWM code before deciding to add the implicit uses to +/// synthesize its liveness. +/// +/// FIXME: I believe this whole scheme may be flawed due to the possibility of +/// the register allocator doing live interval splitting. /// //===----------------------------------------------------------------------===// @@ -59,7 +78,9 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SparseBitVector.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -71,10 +92,18 @@ namespace { class SIFixWWMLiveness : public MachineFunctionPass { private: + MachineDominatorTree *DomTree; + MachineLoopInfo *LoopInfo; LiveIntervals *LIS = nullptr; + const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; + std::vector<MachineInstr *> WWMs; + std::vector<MachineOperand *> ThenDefs; + std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopExitDefs; + std::vector<std::pair<MachineOperand *, MachineLoop *>> LoopPhiDefs; + public: static char ID; @@ -84,13 +113,11 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; - bool runOnWWMInstruction(MachineInstr &MI); - - void addDefs(const MachineInstr &MI, SparseBitVector<> &set); - StringRef getPassName() const override { return "SI Fix WWM Liveness"; } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequiredID(MachineDominatorsID); + 
AU.addRequiredID(MachineLoopInfoID); // Should preserve the same set that TwoAddressInstructions does. AU.addPreserved<SlotIndexes>(); AU.addPreserved<LiveIntervals>(); @@ -100,11 +127,21 @@ public: AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } + +private: + void processDef(MachineOperand &DefOpnd); + bool processThenDef(MachineOperand *DefOpnd); + bool processLoopExitDef(MachineOperand *DefOpnd, MachineLoop *Loop); + bool processLoopPhiDef(MachineOperand *DefOpnd, MachineLoop *Loop); }; } // End anonymous namespace. -INITIALIZE_PASS(SIFixWWMLiveness, DEBUG_TYPE, +INITIALIZE_PASS_BEGIN(SIFixWWMLiveness, DEBUG_TYPE, + "SI fix WWM liveness", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(SIFixWWMLiveness, DEBUG_TYPE, "SI fix WWM liveness", false, false) char SIFixWWMLiveness::ID = 0; @@ -115,89 +152,267 @@ FunctionPass *llvm::createSIFixWWMLivenessPass() { return new SIFixWWMLiveness(); } -void SIFixWWMLiveness::addDefs(const MachineInstr &MI, SparseBitVector<> &Regs) -{ - for (const MachineOperand &Op : MI.defs()) { - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - if (TRI->isVGPR(*MRI, Reg)) - Regs.set(Reg); - } - } -} +bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "SIFixWWMLiveness: function " << MF.getName() << "\n"); + bool Modified = false; + + // This doesn't actually need LiveIntervals, but we can preserve them. + LIS = getAnalysisIfAvailable<LiveIntervals>(); -bool SIFixWWMLiveness::runOnWWMInstruction(MachineInstr &WWM) { - MachineBasicBlock *MBB = WWM.getParent(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - // Compute the registers that are live out of MI by figuring out which defs - // are reachable from MI. 
- SparseBitVector<> LiveOut; + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); - for (auto II = MachineBasicBlock::iterator(WWM), IE = - MBB->end(); II != IE; ++II) { - addDefs(*II, LiveOut); - } + DomTree = &getAnalysis<MachineDominatorTree>(); + LoopInfo = &getAnalysis<MachineLoopInfo>(); - for (df_iterator<MachineBasicBlock *> I = ++df_begin(MBB), - E = df_end(MBB); - I != E; ++I) { - for (const MachineInstr &MI : **I) { - addDefs(MI, LiveOut); + // Scan the function to find the WWM sections and the candidate registers for + // having liveness modified. + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() == AMDGPU::EXIT_WWM) + WWMs.push_back(&MI); + else { + for (MachineOperand &DefOpnd : MI.defs()) { + if (DefOpnd.isReg()) { + unsigned Reg = DefOpnd.getReg(); + if (TRI->isVGPR(*MRI, Reg)) + processDef(DefOpnd); + } + } + } } } + if (!WWMs.empty()) { + // Synthesize liveness over WWM sections as required. + for (auto ThenDef : ThenDefs) + Modified |= processThenDef(ThenDef); + for (auto LoopExitDef : LoopExitDefs) + Modified |= processLoopExitDef(LoopExitDef.first, LoopExitDef.second); + for (auto LoopPhiDef : LoopPhiDefs) + Modified |= processLoopPhiDef(LoopPhiDef.first, LoopPhiDef.second); + } - // Compute the registers that reach MI. - SparseBitVector<> Reachable; + WWMs.clear(); + ThenDefs.clear(); + LoopExitDefs.clear(); + LoopPhiDefs.clear(); - for (auto II = ++MachineBasicBlock::reverse_iterator(WWM), IE = - MBB->rend(); II != IE; ++II) { - addDefs(*II, Reachable); - } + return Modified; +} - for (idf_iterator<MachineBasicBlock *> I = ++idf_begin(MBB), - E = idf_end(MBB); - I != E; ++I) { - for (const MachineInstr &MI : **I) { - addDefs(MI, Reachable); +// During the function scan, process an operand that defines a VGPR. +// This categorizes the register and puts it in the appropriate list for later +// use when processing a WWM section. 
+void SIFixWWMLiveness::processDef(MachineOperand &DefOpnd) { + unsigned Reg = DefOpnd.getReg(); + // Get all the defining instructions. For convenience, make Defs[0] the def + // we are on now. + SmallVector<const MachineInstr *, 4> Defs; + Defs.push_back(DefOpnd.getParent()); + for (auto &MI : MRI->def_instructions(Reg)) { + if (&MI != DefOpnd.getParent()) + Defs.push_back(&MI); + } + // Check whether this def dominates all the others. If not, ignore this def. + // Either it is going to be processed when the scan encounters its other def + // that dominates all defs, or there is no def that dominates all others. + // The latter case is an eliminated phi from an if..else..endif or similar, + // which must be for uniform control flow so can be ignored. + // Because this pass runs shortly after PHIElimination, we assume that any + // multi-def register is a lowered phi, and thus has each def in a separate + // basic block. + for (unsigned I = 1; I != Defs.size(); ++I) { + if (!DomTree->dominates(Defs[0]->getParent(), Defs[I]->getParent())) + return; + } + // Check for the case of an if..endif lowered phi: It has two defs, one + // dominates the other, and there is a single use in a successor of the + // dominant def. + // Later we will spot any WWM code inside + // the "then" clause and turn the second def into a partial def so its + // liveness goes through the WWM code in the "then" clause. + if (Defs.size() == 2) { + auto DomDefBlock = Defs[0]->getParent(); + if (DomDefBlock->succ_size() == 2 && MRI->hasOneUse(Reg)) { + auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent(); + for (auto Succ : DomDefBlock->successors()) { + if (Succ == UseBlock) { + LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " is a then phi reg\n"); + ThenDefs.push_back(&DefOpnd); + return; + } + } } } - - // find the intersection, and add implicit uses. 
- LiveOut &= Reachable; - - bool Modified = false; - for (unsigned Reg : LiveOut) { - WWM.addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true)); - if (LIS) { - // FIXME: is there a better way to update the live interval? - LIS->removeInterval(Reg); - LIS->createAndComputeVirtRegInterval(Reg); + // Check for the case of a non-lowered-phi register (single def) that exits + // a loop, that is, it has a use that is outside a loop that the def is + // inside. We find the outermost loop that the def is inside but a use is + // outside. Later we will spot any WWM code inside that loop and then make + // the def a partial def so its liveness goes round the loop and through the + // WWM code. + if (Defs.size() == 1) { + auto Loop = LoopInfo->getLoopFor(Defs[0]->getParent()); + if (!Loop) + return; + bool IsLoopExit = false; + for (auto &Use : MRI->use_instructions(Reg)) { + auto UseBlock = Use.getParent(); + if (Loop->contains(UseBlock)) + continue; + IsLoopExit = true; + while (auto Parent = Loop->getParentLoop()) { + if (Parent->contains(UseBlock)) + break; + Loop = Parent; + } } - Modified = true; + if (!IsLoopExit) + return; + LLVM_DEBUG(dbgs() << printReg(Reg, TRI) + << " is a loop exit reg with loop header at " + << "bb." << Loop->getHeader()->getNumber() << "\n"); + LoopExitDefs.push_back(std::pair<MachineOperand *, MachineLoop *>( + &DefOpnd, Loop)); + return; } - - return Modified; + // Check for the case of a lowered single-preheader-loop phi, that is, a + // multi-def register where the dominating def is in the loop pre-header and + // all other defs are in backedges. Later we will spot any WWM code inside + // that loop and then make the backedge defs partial defs so the liveness + // goes through the WWM code. + // Note that we are ignoring multi-preheader loops on the basis that the + // structurizer does not allow that for non-uniform loops. + // There must be a single use in the loop header. 
+ if (!MRI->hasOneUse(Reg)) + return; + auto UseBlock = MRI->use_begin(Reg)->getParent()->getParent(); + auto Loop = LoopInfo->getLoopFor(UseBlock); + if (!Loop || Loop->getHeader() != UseBlock + || Loop->contains(Defs[0]->getParent())) { + LLVM_DEBUG(dbgs() << printReg(Reg, TRI) + << " is multi-def but single use not in loop header\n"); + return; + } + for (unsigned I = 1; I != Defs.size(); ++I) { + if (!Loop->contains(Defs[I]->getParent())) + return; + } + LLVM_DEBUG(dbgs() << printReg(Reg, TRI) + << " is a loop phi reg with loop header at " + << "bb." << Loop->getHeader()->getNumber() << "\n"); + LoopPhiDefs.push_back( + std::pair<MachineOperand *, MachineLoop *>(&DefOpnd, Loop)); } -bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) { - bool Modified = false; - - // This doesn't actually need LiveIntervals, but we can preserve them. - LIS = getAnalysisIfAvailable<LiveIntervals>(); - - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - - TRI = &TII->getRegisterInfo(); - MRI = &MF.getRegInfo(); +// Process a then phi def: It has two defs, one dominates the other, and there +// is a single use in a successor of the dominant def. Here we spot any WWM +// code inside the "then" clause and turn the second def into a partial def so +// its liveness goes through the WWM code in the "then" clause. +bool SIFixWWMLiveness::processThenDef(MachineOperand *DefOpnd) { + LLVM_DEBUG(dbgs() << "Processing then def: " << *DefOpnd->getParent()); + if (DefOpnd->getParent()->getOpcode() == TargetOpcode::IMPLICIT_DEF) { + // Ignore if dominating def is undef. + LLVM_DEBUG(dbgs() << " ignoring as dominating def is undef\n"); + return false; + } + unsigned Reg = DefOpnd->getReg(); + // Get the use block, which is the endif block. + auto UseBlock = MRI->use_instr_begin(Reg)->getParent(); + // Check whether there is WWM code inside the then branch. 
The WWM code must + // be dominated by the if but not dominated by the endif. + bool ContainsWWM = false; + for (auto WWM : WWMs) { + if (DomTree->dominates(DefOpnd->getParent()->getParent(), WWM->getParent()) + && !DomTree->dominates(UseBlock, WWM->getParent())) { + LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM); + ContainsWWM = true; + break; + } + } + if (!ContainsWWM) + return false; + // Get the other def. + MachineInstr *OtherDef = nullptr; + for (auto &MI : MRI->def_instructions(Reg)) { + if (&MI != DefOpnd->getParent()) + OtherDef = &MI; + } + // Make it a partial def. + OtherDef->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true)); + LLVM_DEBUG(dbgs() << *OtherDef); + return true; +} - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (MI.getOpcode() == AMDGPU::EXIT_WWM) { - Modified |= runOnWWMInstruction(MI); - } +// Process a loop exit def, that is, a register with a single def in a loop +// that has a use outside the loop. Here we spot any WWM code inside that loop +// and then make the def a partial def so its liveness goes round the loop and +// through the WWM code. +bool SIFixWWMLiveness::processLoopExitDef(MachineOperand *DefOpnd, + MachineLoop *Loop) { + LLVM_DEBUG(dbgs() << "Processing loop exit def: " << *DefOpnd->getParent()); + // Check whether there is WWM code inside the loop. + bool ContainsWWM = false; + for (auto WWM : WWMs) { + if (Loop->contains(WWM->getParent())) { + LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM); + ContainsWWM = true; + break; } } + if (!ContainsWWM) + return false; + unsigned Reg = DefOpnd->getReg(); + // Add a new implicit_def in loop preheader(s). + for (auto Pred : Loop->getHeader()->predecessors()) { + if (!Loop->contains(Pred)) { + auto ImplicitDef = BuildMI(*Pred, Pred->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::IMPLICIT_DEF), Reg); + LLVM_DEBUG(dbgs() << *ImplicitDef); + (void)ImplicitDef; + } + } + // Make the original def partial.
+ DefOpnd->getParent()->addOperand(MachineOperand::CreateReg( + Reg, false, /*isImp=*/true)); + LLVM_DEBUG(dbgs() << *DefOpnd->getParent()); + return true; +} - return Modified; +// Process a loop phi def, that is, a multi-def register where the dominating +// def is in the loop pre-header and all other defs are in backedges. Here we +// spot any WWM code inside that loop and then make the backedge defs partial +// defs so the liveness goes through the WWM code. +bool SIFixWWMLiveness::processLoopPhiDef(MachineOperand *DefOpnd, + MachineLoop *Loop) { + LLVM_DEBUG(dbgs() << "Processing loop phi def: " << *DefOpnd->getParent()); + // Check whether there is WWM code inside the loop. + bool ContainsWWM = false; + for (auto WWM : WWMs) { + if (Loop->contains(WWM->getParent())) { + LLVM_DEBUG(dbgs() << " contains WWM: " << *WWM); + ContainsWWM = true; + break; + } + } + if (!ContainsWWM) + return false; + unsigned Reg = DefOpnd->getReg(); + // Remove kill mark from uses. + for (auto &Use : MRI->use_operands(Reg)) + Use.setIsKill(false); + // Make all defs except the dominating one partial defs. + SmallVector<MachineInstr *, 4> Defs; + for (auto &Def : MRI->def_instructions(Reg)) + Defs.push_back(&Def); + for (auto Def : Defs) { + if (DefOpnd->getParent() == Def) + continue; + Def->addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true)); + LLVM_DEBUG(dbgs() << *Def); + } + return true; } + diff --git a/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/lib/Target/AMDGPU/SIFixupVectorISel.cpp new file mode 100644 index 0000000000000..ee39eb04d8316 --- /dev/null +++ b/lib/Target/AMDGPU/SIFixupVectorISel.cpp @@ -0,0 +1,231 @@ +//===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +/// SIFixupVectorISel pass cleans up post ISEL Vector issues. 
+/// Currently this will convert GLOBAL_{LOAD|STORE}_* +/// and GLOBAL_Atomic_* instructions into their _SADDR variants, +/// feeding the sreg into the saddr field of the new instruction. +/// We currently handle a REG_SEQUENCE feeding the vaddr +/// and decompose it into a base and index. +/// +/// Transform: +/// %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21:sgpr_32, %22:vgpr_32 +/// %18:vgpr_32, %20:sreg_64_xexec = V_ADDC_U32_e64 %25:vgpr_32, +/// %24:vgpr_32, %19:sreg_64_xexec +/// %16:vreg_64 = REG_SEQUENCE %17:vgpr_32, %sub0, %18:vgpr_32, %sub1 +/// %11:vreg_64 = COPY %16:vreg_64 +/// %10:vgpr_32 = GLOBAL_LOAD_DWORD killed %11:vreg_64, 16, 0, 0 +/// Into: +/// %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64, 36, 0 +/// %14:vreg_64 = REG_SEQUENCE %6:vgpr_32, %sub0, %15:vgpr_32, %sub1 +/// %10:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %14:vreg_64, %4:sreg_64_xexec,16... +/// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" +#define DEBUG_TYPE "si-fixup-vector-isel" + +using namespace llvm; + +static cl::opt<bool> EnableGlobalSGPRAddr( + "amdgpu-enable-global-sgpr-addr", + cl::desc("Enable use of SGPR regs for GLOBAL LOAD/STORE instructions"), + cl::init(false)); + +STATISTIC(NumSGPRGlobalOccurs, "Number of global ld/st opportunities"); +STATISTIC(NumSGPRGlobalSaddrs, "Number of global sgpr instructions converted"); + +namespace { + +class SIFixupVectorISel : public MachineFunctionPass { +public: + static char ID; + +public: + SIFixupVectorISel() : MachineFunctionPass(ID) { + 
initializeSIFixupVectorISelPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS(SIFixupVectorISel, DEBUG_TYPE, + "SI Fixup Vector ISel", false, false) + +char SIFixupVectorISel::ID = 0; + +char &llvm::SIFixupVectorISelID = SIFixupVectorISel::ID; + +FunctionPass *llvm::createSIFixupVectorISelPass() { + return new SIFixupVectorISel(); +} + +static bool findSRegBaseAndIndex(MachineOperand *Op, + unsigned &BaseReg, + unsigned &IndexReg, + MachineRegisterInfo &MRI, + const SIRegisterInfo *TRI) { + SmallVector<MachineOperand *, 8> Worklist; + Worklist.push_back(Op); + while (!Worklist.empty()) { + MachineOperand *WOp = Worklist.pop_back_val(); + if (!WOp->isReg() || + !TargetRegisterInfo::isVirtualRegister(WOp->getReg())) + continue; + MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg()); + switch (DefInst->getOpcode()) { + default: + continue; + case AMDGPU::COPY: + Worklist.push_back(&DefInst->getOperand(1)); + break; + case AMDGPU::REG_SEQUENCE: + if (DefInst->getNumOperands() != 5) + continue; + Worklist.push_back(&DefInst->getOperand(1)); + Worklist.push_back(&DefInst->getOperand(3)); + break; + case AMDGPU::V_ADD_I32_e64: + // The V_ADD_* and its analogous V_ADDCV_* are generated by + // a previous pass which lowered from an ADD_64_PSEUDO, + // which generates subregs to break up the 64 bit args. + if (DefInst->getOperand(2).getSubReg() != AMDGPU::NoSubRegister) + continue; + BaseReg = DefInst->getOperand(2).getReg(); + if (DefInst->getOperand(3).getSubReg() != AMDGPU::NoSubRegister) + continue; + IndexReg = DefInst->getOperand(3).getReg(); + // Chase the IndexReg. + MachineInstr *MI = MRI.getUniqueVRegDef(IndexReg); + if (!MI || !MI->isCopy()) + continue; + // Make sure the reg class is 64 bit for Index. 
+ // If the Index register is a subreg, we want it to reference + // a 64 bit register which we will use as the Index reg. + const TargetRegisterClass *IdxRC, *BaseRC; + IdxRC = MRI.getRegClass(MI->getOperand(1).getReg()); + if (AMDGPU::getRegBitWidth(IdxRC->getID()) != 64) + continue; + IndexReg = MI->getOperand(1).getReg(); + // Chase the BaseReg. + MI = MRI.getUniqueVRegDef(BaseReg); + if (!MI || !MI->isCopy()) + continue; + // Make sure the register class is 64 bit for Base. + BaseReg = MI->getOperand(1).getReg(); + BaseRC = MRI.getRegClass(BaseReg); + if (AMDGPU::getRegBitWidth(BaseRC->getID()) != 64) + continue; + // Make sure Base is SReg and Index is VReg. + if (!TRI->isSGPRReg(MRI, BaseReg)) + return false; + if (!TRI->hasVGPRs(MRI.getRegClass(IndexReg))) + return false; + // clear any killed flags on Index and Base regs, used later. + MRI.clearKillFlags(IndexReg); + MRI.clearKillFlags(BaseReg); + return true; + } + } + return false; +} + +// Identify Global LOAD|STORE/ATOMIC and try to convert to _SADDR. +static bool fixupGlobalSaddr(MachineBasicBlock &MBB, + MachineFunction &MF, + MachineRegisterInfo &MRI, + const GCNSubtarget &ST, + const SIInstrInfo *TII, + const SIRegisterInfo *TRI) { + if (!EnableGlobalSGPRAddr) + return false; + bool FuncModified = false; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + int NewOpcd = AMDGPU::getGlobalSaddrOp(MI.getOpcode()); + if (NewOpcd < 0) + continue; + // Update our statistics on opportunities seen. + ++NumSGPRGlobalOccurs; + LLVM_DEBUG(dbgs() << "Global Mem opp " << MI << '\n'); + // Need a Base and Index or we can't transform to _SADDR.
+ unsigned BaseReg = 0; + unsigned IndexReg = 0; + MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); + if (!findSRegBaseAndIndex(Op, BaseReg, IndexReg, MRI, TRI)) + continue; + ++NumSGPRGlobalSaddrs; + FuncModified = true; + // Create the new _SADDR Memory instruction. + bool HasVdst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst) != nullptr; + MachineOperand *VData = TII->getNamedOperand(MI, AMDGPU::OpName::vdata); + MachineInstr *NewGlob = nullptr; + NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd)); + if (HasVdst) + NewGlob->addOperand(MF, MI.getOperand(0)); + NewGlob->addOperand(MF, MachineOperand::CreateReg(IndexReg, false)); + if (VData) + NewGlob->addOperand(MF, *VData); + NewGlob->addOperand(MF, MachineOperand::CreateReg(BaseReg, false)); + NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::offset)); + + MachineOperand *Glc = TII->getNamedOperand(MI, AMDGPU::OpName::glc); + // Atomics dont have a GLC, so omit the field if not there. + if (Glc) + NewGlob->addOperand(MF, *Glc); + NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc)); + // _D16 have an vdst_in operand, copy it in. + MachineOperand *VDstInOp = TII->getNamedOperand(MI, + AMDGPU::OpName::vdst_in); + if (VDstInOp) + NewGlob->addOperand(MF, *VDstInOp); + NewGlob->copyImplicitOps(MF, MI); + NewGlob->cloneMemRefs(MF, MI); + // Remove the old Global Memop instruction. + MI.eraseFromParent(); + LLVM_DEBUG(dbgs() << "New Global Mem " << *NewGlob << '\n'); + } + return FuncModified; +} + +bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + MachineRegisterInfo &MRI = MF.getRegInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + bool FuncModified = false; + for (MachineBasicBlock &MBB : MF) { + // Cleanup missed Saddr opportunites from ISel. 
+ FuncModified |= fixupGlobalSaddr(MBB, MF, MRI, ST, TII, TRI); + } + return FuncModified; +} diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 338cabcb906bc..f4e8669583699 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -35,13 +35,16 @@ struct FoldCandidate { uint64_t ImmToFold; int FrameIndexToFold; }; + int ShrinkOpcode; unsigned char UseOpNo; MachineOperand::MachineOperandType Kind; bool Commuted; FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp, - bool Commuted_ = false) : - UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()), + bool Commuted_ = false, + int ShrinkOp = -1) : + UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo), + Kind(FoldOp->getType()), Commuted(Commuted_) { if (FoldOp->isImm()) { ImmToFold = FoldOp->getImm(); @@ -68,6 +71,14 @@ struct FoldCandidate { bool isCommuted() const { return Commuted; } + + bool needsShrink() const { + return ShrinkOpcode != -1; + } + + int getShrinkOpcode() const { + return ShrinkOpcode; + } }; class SIFoldOperands : public MachineFunctionPass { @@ -154,6 +165,7 @@ FunctionPass *llvm::createSIFoldOperandsPass() { } static bool updateOperand(FoldCandidate &Fold, + const SIInstrInfo &TII, const TargetRegisterInfo &TRI) { MachineInstr *MI = Fold.UseMI; MachineOperand &Old = MI->getOperand(Fold.UseOpNo); @@ -189,10 +201,49 @@ static bool updateOperand(FoldCandidate &Fold, Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); } } + + if (Fold.needsShrink()) { + MachineBasicBlock *MBB = MI->getParent(); + auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI); + if (Liveness != MachineBasicBlock::LQR_Dead) + return false; + + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + int Op32 = Fold.getShrinkOpcode(); + MachineOperand &Dst0 = MI->getOperand(0); + MachineOperand &Dst1 = MI->getOperand(1); + assert(Dst0.isDef() && Dst1.isDef()); + + bool 
HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg()); + + const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg()); + unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC); + const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg()); + unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC); + + MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32); + + if (HaveNonDbgCarryUse) { + BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg()) + .addReg(AMDGPU::VCC, RegState::Kill); + } + + // Keep the old instruction around to avoid breaking iterators, but + // replace the outputs with dummy registers. + Dst0.setReg(NewReg0); + Dst1.setReg(NewReg1); + + if (Fold.isCommuted()) + TII.commuteInstruction(*Inst32, false); + return true; + } + Old.ChangeToImmediate(Fold.ImmToFold); return true; } + assert(!Fold.needsShrink() && "not handled"); + if (Fold.isFI()) { Old.ChangeToFrameIndex(Fold.FrameIndexToFold); return true; @@ -261,6 +312,8 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, if (isUseMIInFoldList(FoldList, MI)) return false; + unsigned CommuteOpNo = OpNo; + // Operand is not legal, so try to commute the instruction to // see if this makes it possible to fold. unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex; @@ -269,11 +322,12 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, if (CanCommute) { if (CommuteIdx0 == OpNo) - OpNo = CommuteIdx1; + CommuteOpNo = CommuteIdx1; else if (CommuteIdx1 == OpNo) - OpNo = CommuteIdx0; + CommuteOpNo = CommuteIdx0; } + // One of operands might be an Imm operand, and OpNo may refer to it after // the call of commuteInstruction() below. 
Such situations are avoided // here explicitly as OpNo must be a register operand to be a candidate @@ -286,12 +340,34 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList, !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1)) return false; - if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) { + if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) { + if ((Opc == AMDGPU::V_ADD_I32_e64 || + Opc == AMDGPU::V_SUB_I32_e64 || + Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME + OpToFold->isImm()) { + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + // Verify the other operand is a VGPR, otherwise we would violate the + // constant bus restriction. + unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0; + MachineOperand &OtherOp = MI->getOperand(OtherIdx); + if (!OtherOp.isReg() || + !TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg())) + return false; + + assert(MI->getOperand(1).isDef()); + + int Op32 = AMDGPU::getVOPe32(Opc); + FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true, + Op32)); + return true; + } + TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1); return false; } - FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true)); + FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true)); return true; } @@ -362,8 +438,6 @@ void SIFoldOperands::foldOperand( bool FoldingImm = OpToFold.isImm(); - // In order to fold immediates into copies, we need to change the - // copy to a MOV. 
if (FoldingImm && UseMI->isCopy()) { unsigned DestReg = UseMI->getOperand(0).getReg(); const TargetRegisterClass *DestRC @@ -371,6 +445,31 @@ void SIFoldOperands::foldOperand( MRI->getRegClass(DestReg) : TRI->getPhysRegClass(DestReg); + unsigned SrcReg = UseMI->getOperand(1).getReg(); + if (TargetRegisterInfo::isVirtualRegister(DestReg) && + TargetRegisterInfo::isVirtualRegister(SrcReg)) { + const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg); + if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) { + MachineRegisterInfo::use_iterator NextUse; + SmallVector<FoldCandidate, 4> CopyUses; + for (MachineRegisterInfo::use_iterator + Use = MRI->use_begin(DestReg), E = MRI->use_end(); + Use != E; Use = NextUse) { + NextUse = std::next(Use); + FoldCandidate FC = FoldCandidate(Use->getParent(), + Use.getOperandNo(), &UseMI->getOperand(1)); + CopyUses.push_back(FC); + } + for (auto & F : CopyUses) { + foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, + FoldList, CopiesToReplace); + } + } + } + + // In order to fold immediates into copies, we need to change the + // copy to a MOV. + unsigned MovOp = TII->getMovOpcode(DestRC); if (MovOp == AMDGPU::COPY) return; @@ -378,6 +477,20 @@ void SIFoldOperands::foldOperand( UseMI->setDesc(TII->get(MovOp)); CopiesToReplace.push_back(UseMI); } else { + if (UseMI->isCopy() && OpToFold.isReg() && + TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) && + TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(1).getReg()) && + TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) && + TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) && + !UseMI->getOperand(1).getSubReg()) { + UseMI->getOperand(1).setReg(OpToFold.getReg()); + UseMI->getOperand(1).setSubReg(OpToFold.getSubReg()); + UseMI->getOperand(1).setIsKill(false); + CopiesToReplace.push_back(UseMI); + OpToFold.setIsKill(false); + return; + } + const MCInstrDesc &UseDesc = UseMI->getDesc(); // Don't fold into target independent nodes. 
Target independent opcodes @@ -550,6 +663,19 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI, if (!Src0->isImm() && !Src1->isImm()) return false; + if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) { + if (Src0->isImm() && Src0->getImm() == 0) { + // v_lshl_or_b32 0, X, Y -> copy Y + // v_lshl_or_b32 0, X, K -> v_mov_b32 K + bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg(); + MI->RemoveOperand(Src1Idx); + MI->RemoveOperand(Src0Idx); + + MI->setDesc(TII->get(UseCopy ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32)); + return true; + } + } + // and k0, k1 -> v_mov_b32 (k0 & k1) // or k0, k1 -> v_mov_b32 (k0 | k1) // xor k0, k1 -> v_mov_b32 (k0 ^ k1) @@ -728,13 +854,17 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, } } else { // Folding register. + SmallVector <MachineRegisterInfo::use_iterator, 4> UsesToProcess; for (MachineRegisterInfo::use_iterator Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); Use != E; ++Use) { - MachineInstr *UseMI = Use->getParent(); + UsesToProcess.push_back(Use); + } + for (auto U : UsesToProcess) { + MachineInstr *UseMI = U->getParent(); - foldOperand(OpToFold, UseMI, Use.getOperandNo(), - FoldList, CopiesToReplace); + foldOperand(OpToFold, UseMI, U.getOperandNo(), + FoldList, CopiesToReplace); } } @@ -744,7 +874,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, Copy->addImplicitDefUseOperands(*MF); for (FoldCandidate &Fold : FoldList) { - if (updateOperand(Fold, *TRI)) { + if (updateOperand(Fold, *TII, *TRI)) { // Clear kill flags. if (Fold.isReg()) { assert(Fold.OpToFold && Fold.OpToFold->isReg()); @@ -981,9 +1111,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { // omod is ignored by hardware if IEEE bit is enabled. omod also does not // correctly handle signed zeros. // - // TODO: Check nsz on instructions when fast math flags are preserved to MI - // level. 
- bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath(); + bool IsIEEEMode = ST->enableIEEEBit(MF); + bool HasNSZ = MFI->hasNoSignedZerosFPMath(); for (MachineBasicBlock *MBB : depth_first(&MF)) { MachineBasicBlock::iterator I, Next; @@ -994,7 +1123,10 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { tryFoldInst(TII, &MI); if (!TII->isFoldableCopy(MI)) { - if (IsIEEEMode || !tryFoldOMod(MI)) + // TODO: Omod might be OK if there is NSZ only on the source + // instruction, and not the omod multiply. + if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) || + !tryFoldOMod(MI)) tryFoldClamp(MI); continue; } diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp index cd14239de822b..aa976d5141f86 100644 --- a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -168,16 +168,15 @@ void SIFormMemoryClauses::forAllLanes(unsigned Reg, LaneBitmask LaneMask, CoveringSubregs.push_back(Idx); } - llvm::sort(CoveringSubregs.begin(), CoveringSubregs.end(), - [this](unsigned A, unsigned B) { - LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A); - LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B); - unsigned NA = MaskA.getNumLanes(); - unsigned NB = MaskB.getNumLanes(); - if (NA != NB) - return NA > NB; - return MaskA.getHighestLane() > MaskB.getHighestLane(); - }); + llvm::sort(CoveringSubregs, [this](unsigned A, unsigned B) { + LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A); + LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B); + unsigned NA = MaskA.getNumLanes(); + unsigned NB = MaskB.getNumLanes(); + if (NA != NB) + return NA > NB; + return MaskA.getHighestLane() > MaskB.getHighestLane(); + }); for (unsigned Idx : CoveringSubregs) { LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index ac0ef90f25a4f..e4633c88e18ff 100644 --- 
a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -289,7 +289,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdCodeObjectV2(F)) { + if (ST.isAmdHsaOrMesa(F)) { PreloadedPrivateBufferReg = MFI->getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); } @@ -308,7 +308,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, } if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) { - assert(ST.isAmdCodeObjectV2(F) || ST.isMesaGfxShader(F)); + assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F)); MRI.addLiveIn(PreloadedPrivateBufferReg); MBB.addLiveIn(PreloadedPrivateBufferReg); } @@ -333,7 +333,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, bool CopyBuffer = ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister && - ST.isAmdCodeObjectV2(F) && + ST.isAmdHsaOrMesa(F) && ScratchRsrcReg != PreloadedPrivateBufferReg; // This needs to be careful of the copying order to avoid overwriting one of @@ -433,7 +433,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, } if (ST.isMesaGfxShader(Fn) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) { - assert(!ST.isAmdCodeObjectV2(Fn)); + assert(!ST.isAmdHsaOrMesa(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 25007861fd158..0ba921647097d 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -#ifdef _MSC_VER +#if defined(_MSC_VER) || defined(__MINGW32__) // Provide M_PI. 
#define _USE_MATH_DEFINES #endif @@ -156,12 +156,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v8i32, Custom); setOperationAction(ISD::LOAD, MVT::v16i32, Custom); setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::LOAD, MVT::v32i32, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); setOperationAction(ISD::STORE, MVT::v8i32, Custom); setOperationAction(ISD::STORE, MVT::v16i32, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); + setOperationAction(ISD::STORE, MVT::v32i32, Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); @@ -207,11 +209,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); @@ -232,6 +237,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::ADDCARRY, MVT::i32, Legal); setOperationAction(ISD::SUBCARRY, MVT::i32, Legal); + setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand); + setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand); + setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand); + #if 0 setOperationAction(ISD::ADDCARRY, MVT::i64, 
Legal); setOperationAction(ISD::SUBCARRY, MVT::i64, Legal); @@ -240,7 +249,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, - MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) { + MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -339,6 +348,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->has16BitInsts()) { setOperationAction(ISD::FLOG, MVT::f16, Custom); + setOperationAction(ISD::FEXP, MVT::f16, Custom); setOperationAction(ISD::FLOG10, MVT::f16, Custom); } @@ -375,8 +385,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasBFE()) setHasExtractBitsInsn(true); - setOperationAction(ISD::FMINNUM, MVT::f64, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Custom); + setOperationAction(ISD::FMAXNUM, MVT::f32, Custom); + setOperationAction(ISD::FMINNUM, MVT::f64, Custom); + setOperationAction(ISD::FMAXNUM, MVT::f64, Custom); + + + // These are really only legal for ieee_mode functions. We should be avoiding + // them for functions that don't have ieee_mode enabled, so just say they are + // legal. + setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal); + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); @@ -465,8 +487,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // F16 - VOP2 Actions. 
setOperationAction(ISD::BR_CC, MVT::f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); - setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); - setOperationAction(ISD::FMINNUM, MVT::f16, Legal); + setOperationAction(ISD::FDIV, MVT::f16, Custom); // F16 - VOP3 Actions. @@ -549,6 +570,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // This isn't really legal, but this avoids the legalizer unrolling it (and // allows matching fneg (fabs x) patterns) setOperationAction(ISD::FABS, MVT::v2f16, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f16, Custom); + setOperationAction(ISD::FMINNUM, MVT::f16, Custom); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal); + setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal); + + setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom); + + setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand); + setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand); } if (Subtarget->hasVOP3PInsts()) { @@ -566,8 +598,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FADD, MVT::v2f16, Legal); setOperationAction(ISD::FMUL, MVT::v2f16, Legal); setOperationAction(ISD::FMA, MVT::v2f16, Legal); - setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal); + + setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal); + setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal); + setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); @@ -587,9 +621,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FADD, MVT::v4f16, Custom); setOperationAction(ISD::FMUL, MVT::v4f16, Custom); + + setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom); + setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom); + setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom); 
setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom); + setOperationAction(ISD::FEXP, MVT::v2f16, Custom); setOperationAction(ISD::SELECT, MVT::v4i16, Custom); setOperationAction(ISD::SELECT, MVT::v4f16, Custom); } @@ -623,6 +663,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); + setTargetDAGCombine(ISD::FMINNUM_IEEE); + setTargetDAGCombine(ISD::FMAXNUM_IEEE); setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::SMIN); setTargetDAGCombine(ISD::SMAX); @@ -638,7 +680,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::BUILD_VECTOR); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. @@ -707,9 +749,7 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, if (Size == 64) return MVT::i32; - if (Size == 16 && - Subtarget->has16BitInsts() && - isPowerOf2_32(VT.getVectorNumElements())) + if (Size == 16 && Subtarget->has16BitInsts()) return VT.isInteger() ? MVT::v2i16 : MVT::v2f16; } @@ -730,9 +770,8 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, if (Size == 64) return 2 * NumElts; - // FIXME: Fails to break down as we want with v3. 
- if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts)) - return VT.getVectorNumElements() / 2; + if (Size == 16 && Subtarget->has16BitInsts()) + return (VT.getVectorNumElements() + 1) / 2; } return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); @@ -763,10 +802,10 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( // FIXME: We should fix the ABI to be the same on targets without 16-bit // support, but unless we can properly handle 3-vectors, it will be still be // inconsistent. - if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts)) { + if (Size == 16 && Subtarget->has16BitInsts()) { RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16; IntermediateVT = RegisterVT; - NumIntermediates = NumElts / 2; + NumIntermediates = (NumElts + 1) / 2; return NumIntermediates; } } @@ -775,6 +814,47 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); } +static MVT memVTFromAggregate(Type *Ty) { + // Only limited forms of aggregate type currently expected. 
+ assert(Ty->isStructTy() && "Expected struct type"); + + + Type *ElementType = nullptr; + unsigned NumElts; + if (Ty->getContainedType(0)->isVectorTy()) { + VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0)); + ElementType = VecComponent->getElementType(); + NumElts = VecComponent->getNumElements(); + } else { + ElementType = Ty->getContainedType(0); + NumElts = 1; + } + + assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type"); + + // Calculate the size of the memVT type from the aggregate + unsigned Pow2Elts = 0; + unsigned ElementSize; + switch (ElementType->getTypeID()) { + default: + llvm_unreachable("Unknown type!"); + case Type::IntegerTyID: + ElementSize = cast<IntegerType>(ElementType)->getBitWidth(); + break; + case Type::HalfTyID: + ElementSize = 16; + break; + case Type::FloatTyID: + ElementSize = 32; + break; + } + unsigned AdditionalElts = ElementSize == 16 ? 2 : 1; + Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts); + + return MVT::getVectorVT(MVT::getVT(ElementType, false), + Pow2Elts); +} + bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, MachineFunction &MF, @@ -802,7 +882,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MODereferenceable; if (Attr.hasFnAttribute(Attribute::ReadOnly)) { Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); + Info.memVT = MVT::getVT(CI.getType(), true); + if (Info.memVT == MVT::Other) { + // Some intrinsics return an aggregate type - special case to work out + // the correct memVT + Info.memVT = memVTFromAggregate(CI.getType()); + } Info.flags |= MachineMemOperand::MOLoad; } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) { Info.opc = ISD::INTRINSIC_VOID; @@ -941,11 +1026,11 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, if (AM.BaseGV) return false; - if (AS == AMDGPUASI.GLOBAL_ADDRESS) + if (AS == 
AMDGPUAS::GLOBAL_ADDRESS) return isLegalGlobalAddressingMode(AM); - if (AS == AMDGPUASI.CONSTANT_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) { + if (AS == AMDGPUAS::CONSTANT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { // If the offset isn't a multiple of 4, it probably isn't going to be // correctly aligned. // FIXME: Can we get the real alignment here? @@ -983,10 +1068,10 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return false; - } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { + } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { return isLegalMUBUFAddressingMode(AM); - } else if (AS == AMDGPUASI.LOCAL_ADDRESS || - AS == AMDGPUASI.REGION_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS || + AS == AMDGPUAS::REGION_ADDRESS) { // Basic, single offset DS instructions allow a 16-bit unsigned immediate // field. // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have @@ -1001,8 +1086,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return true; return false; - } else if (AS == AMDGPUASI.FLAT_ADDRESS || - AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) { + } else if (AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) { // For an unknown address space, this usually means that this is for some // reason being used for pure arithmetic, and not based on some addressing // computation. 
We don't have instructions that compute pointers with any @@ -1016,12 +1101,12 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, const SelectionDAG &DAG) const { - if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) { + if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) { return (MemVT.getSizeInBits() <= 4 * 32); - } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { + } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize(); return (MemVT.getSizeInBits() <= MaxPrivateBits); - } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { return (MemVT.getSizeInBits() <= 2 * 32); } return true; @@ -1043,8 +1128,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, return false; } - if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS || - AddrSpace == AMDGPUASI.REGION_ADDRESS) { + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || + AddrSpace == AMDGPUAS::REGION_ADDRESS) { // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte // aligned, 8 byte access in a single operation using ds_read2/write2_b32 // with adjacent offsets. @@ -1059,17 +1144,21 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // will access scratch. If we had access to the IR function, then we // could determine if any private memory was used in the function. if (!Subtarget->hasUnalignedScratchAccess() && - (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS || - AddrSpace == AMDGPUASI.FLAT_ADDRESS)) { - return false; + (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || + AddrSpace == AMDGPUAS::FLAT_ADDRESS)) { + bool AlignedBy4 = Align >= 4; + if (IsFast) + *IsFast = AlignedBy4; + + return AlignedBy4; } if (Subtarget->hasUnalignedBufferAccess()) { // If we have an uniform constant load, it still requires using a slow // buffer instruction if unaligned. 
if (IsFast) { - *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS || - AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ? + *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || + AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ? (Align % 4 == 0) : true; } @@ -1109,17 +1198,15 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, return MVT::Other; } -static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) { - return AS == AMDGPUASI.GLOBAL_ADDRESS || - AS == AMDGPUASI.FLAT_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT; +static bool isFlatGlobalAddrSpace(unsigned AS) { + return AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; } bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { - return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) && - isFlatGlobalAddrSpace(DestAS, AMDGPUASI); + return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); } bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { @@ -1133,7 +1220,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { // Flat -> private/local is a simple truncate. 
// Flat -> global is no-op - if (SrcAS == AMDGPUASI.FLAT_ADDRESS) + if (SrcAS == AMDGPUAS::FLAT_ADDRESS) return true; return isNoopAddrSpaceCast(SrcAS, DestAS); @@ -1146,7 +1233,7 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const { } TargetLoweringBase::LegalizeTypeAction -SITargetLowering::getPreferredVectorAction(EVT VT) const { +SITargetLowering::getPreferredVectorAction(MVT VT) const { if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) return TypeSplitVector; @@ -1200,7 +1287,7 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); - MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS); + MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); @@ -1240,7 +1327,7 @@ SDValue SITargetLowering::lowerKernargMemParameter( uint64_t Offset, unsigned Align, bool Signed, const ISD::InputArg *Arg) const { Type *Ty = MemVT.getTypeForEVT(*DAG.getContext()); - PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); // Try to avoid using an extload by loading earlier than the argument address, @@ -1349,7 +1436,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { const ISD::InputArg *Arg = &Ins[I]; - assert(!Arg->VT.isVector() && "vector type argument should have been split"); + assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) && + "vector type argument should have been split"); // First check if it's a PS input addr. 
if (CallConv == CallingConv::AMDGPU_PS && @@ -1642,7 +1730,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - if (ST.isAmdCodeObjectV2(MF.getFunction())) { + if (ST.isAmdHsaOrMesa(MF.getFunction())) { if (RequiresStackAccess) { // If we have stack objects, we unquestionably need the private buffer // resource. For the Code Object V2 ABI, this will be the first 4 user @@ -1951,29 +2039,6 @@ SDValue SITargetLowering::LowerFormalArguments( llvm_unreachable("Unknown loc info!"); } - if (IsShader && Arg.VT.isVector()) { - // Build a vector from the registers - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - SmallVector<SDValue, 4> Regs; - Regs.push_back(Val); - for (unsigned j = 1; j != NumElements; ++j) { - Reg = ArgLocs[ArgIdx++].getLocReg(); - Reg = MF.addLiveIn(Reg, RC); - - SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT); - Regs.push_back(Copy); - } - - // Fill up the missing vector elements - NumElements = Arg.VT.getVectorNumElements() - NumElements; - Regs.append(NumElements, DAG.getUNDEF(VT)); - - InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs)); - continue; - } - InVals.push_back(Val); } @@ -2037,48 +2102,19 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsShader = AMDGPU::isShader(CallConv); - Info->setIfReturnsVoid(Outs.size() == 0); + Info->setIfReturnsVoid(Outs.empty()); bool IsWaveEnd = Info->returnsVoid() && IsShader; - SmallVector<ISD::OutputArg, 48> Splits; - SmallVector<SDValue, 48> SplitVals; - - // Split vectors into their elements. 
- for (unsigned i = 0, e = Outs.size(); i != e; ++i) { - const ISD::OutputArg &Out = Outs[i]; - - if (IsShader && Out.VT.isVector()) { - MVT VT = Out.VT.getVectorElementType(); - ISD::OutputArg NewOut = Out; - NewOut.Flags.setSplit(); - NewOut.VT = VT; - - // We want the original number of vector elements here, e.g. - // three or five, not four or eight. - unsigned NumElements = Out.ArgVT.getVectorNumElements(); - - for (unsigned j = 0; j != NumElements; ++j) { - SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i], - DAG.getConstant(j, DL, MVT::i32)); - SplitVals.push_back(Elem); - Splits.push_back(NewOut); - NewOut.PartOffset += NewOut.VT.getStoreSize(); - } - } else { - SplitVals.push_back(OutVals[i]); - Splits.push_back(Out); - } - } - // CCValAssign - represent the assignment of the return value to a location. SmallVector<CCValAssign, 48> RVLocs; + SmallVector<ISD::OutputArg, 48> Splits; // CCState - Info about the registers and stack slots. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); // Analyze outgoing return values. - CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg)); + CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); SDValue Flag; SmallVector<SDValue, 48> RetOps; @@ -2103,14 +2139,12 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, } // Copy the result values into the output registers. - for (unsigned i = 0, realRVLocIdx = 0; - i != RVLocs.size(); - ++i, ++realRVLocIdx) { - CCValAssign &VA = RVLocs[i]; + for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E; + ++I, ++RealRVLocIdx) { + CCValAssign &VA = RVLocs[I]; assert(VA.isRegLoc() && "Can only return in registers!"); // TODO: Partially return in registers if return values don't fit. - - SDValue Arg = SplitVals[realRVLocIdx]; + SDValue Arg = OutVals[RealRVLocIdx]; // Copied from other backends. 
switch (VA.getLocInfo()) { @@ -2225,11 +2259,11 @@ SDValue SITargetLowering::LowerCallResult( // from the explicit user arguments present in the IR. void SITargetLowering::passSpecialInputs( CallLoweringInfo &CLI, + CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, SmallVectorImpl<SDValue> &MemOpChains, - SDValue Chain, - SDValue StackPtr) const { + SDValue Chain) const { // If we don't have a call site, this was a call inserted by // legalization. These can never use special inputs. if (!CLI.CS) @@ -2297,9 +2331,9 @@ void SITargetLowering::passSpecialInputs( if (OutgoingArg->isRegister()) { RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); } else { - SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr, - InputReg, - OutgoingArg->getStackOffset()); + unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4); + SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg, + SpecialArgOffset); MemOpChains.push_back(ArgStore); } } @@ -2424,6 +2458,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, "unsupported call to variadic function "); } + if (!CLI.CS.getInstruction()) + report_fatal_error("unsupported libcall legalization"); + if (!CLI.CS.getCalledFunction()) { return lowerUnhandledCall(CLI, InVals, "unsupported indirect call to function "); @@ -2442,8 +2479,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } // The first 4 bytes are reserved for the callee's emergency stack slot. - const unsigned CalleeUsableStackOffset = 4; - if (IsTailCall) { IsTailCall = isEligibleForTailCallOptimization( Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); @@ -2463,25 +2498,16 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, ++NumTailCalls; } - if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) { - // FIXME: Remove this hack for function pointer types after removing - // support of old address space mapping. 
In the new address space - // mapping the pointer in default address space is 64 bit, therefore - // does not need this hack. - if (Callee.getValueType() == MVT::i32) { - const GlobalValue *GV = GA->getGlobal(); - Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false, - GA->getTargetFlags()); - } - } - assert(Callee.getValueType() == MVT::i64); - const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); + + // The first 4 bytes are reserved for the callee's emergency stack slot. + CCInfo.AllocateStack(4, 4); + CCInfo.AnalyzeCallOperands(Outs, AssignFn); // Get a count of how many bytes are to be pushed on the stack. @@ -2529,10 +2555,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } } - // Stack pointer relative accesses are done by changing the offset SGPR. This - // is just the VGPR offset component. - SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32); - SmallVector<SDValue, 8> MemOpChains; MVT PtrVT = MVT::i32; @@ -2576,18 +2598,22 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, unsigned LocMemOffset = VA.getLocMemOffset(); int32_t Offset = LocMemOffset; - SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset); + SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT); + unsigned Align = 0; if (IsTailCall) { ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() : VA.getValVT().getStoreSize(); + // FIXME: We can have better than the minimum byval required alignment. + Align = Flags.isByVal() ? 
Flags.getByValAlign() : + MinAlign(Subtarget->getStackAlignment(), Offset); + Offset = Offset + FPDiff; int FI = MFI.CreateFixedObject(OpSize, Offset, true); - DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT), - StackPtr); + DstAddr = DAG.getFrameIndex(FI, PtrVT); DstInfo = MachinePointerInfo::getFixedStack(MF, FI); // Make sure any stack arguments overlapping with where we're storing @@ -2601,6 +2627,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } else { DstAddr = PtrOff; DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); + Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset); } if (Outs[i].Flags.isByVal()) { @@ -2611,18 +2638,18 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, /*isVol = */ false, /*AlwaysInline = */ true, /*isTailCall = */ false, DstInfo, MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy( - *DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS)))); + *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS)))); MemOpChains.push_back(Cpy); } else { - SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); + SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align); MemOpChains.push_back(Store); } } } // Copy special input registers after user input arguments. 
- passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr); + passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); @@ -3460,7 +3487,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) MIB.add(MI.getOperand(I)); - MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB.cloneMemRefs(MI); MI.eraseFromParent(); return BB; } @@ -3628,7 +3655,11 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerDEBUGTRAP(Op, DAG); case ISD::FABS: case ISD::FNEG: + case ISD::FCANONICALIZE: return splitUnaryVectorOp(Op, DAG); + case ISD::FMINNUM: + case ISD::FMAXNUM: + return lowerFMINNUM_FMAXNUM(Op, DAG); case ISD::SHL: case ISD::SRA: case ISD::SRL: @@ -3639,10 +3670,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: - case ISD::FMINNUM: - case ISD::FMAXNUM: case ISD::FADD: case ISD::FMUL: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: return splitBinaryVectorOp(Op, DAG); } return SDValue(); @@ -3678,18 +3709,9 @@ static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M, SelectionDAG &DAG, + ArrayRef<SDValue> Ops, bool IsIntrinsic) const { SDLoc DL(M); - SmallVector<SDValue, 10> Ops; - Ops.reserve(M->getNumOperands()); - - Ops.push_back(M->getOperand(0)); - if (IsIntrinsic) - Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32)); - - // Skip 1, as it is the intrinsic ID. 
- for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I) - Ops.push_back(M->getOperand(I)); bool Unpacked = Subtarget->hasUnpackedD16VMem(); EVT LoadVT = M->getValueType(0); @@ -3717,6 +3739,69 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL); } +static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, + SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3)); + if (!CD) + return DAG.getUNDEF(VT); + + int CondCode = CD->getSExtValue(); + if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE || + CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE) + return DAG.getUNDEF(VT); + + ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); + + + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + + SDLoc DL(N); + + EVT CmpVT = LHS.getValueType(); + if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) { + unsigned PromoteOp = ICmpInst::isSigned(IcInput) ? 
+ ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS); + RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS); + } + + ISD::CondCode CCOpcode = getICmpCondCode(IcInput); + + return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS, + DAG.getCondCode(CCOpcode)); +} + +static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, + SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3)); + if (!CD) + return DAG.getUNDEF(VT); + + int CondCode = CD->getSExtValue(); + if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE || + CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) { + return DAG.getUNDEF(VT); + } + + SDValue Src0 = N->getOperand(1); + SDValue Src1 = N->getOperand(2); + EVT CmpVT = Src0.getValueType(); + SDLoc SL(N); + + if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) { + Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0); + Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1); + } + + FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode); + ISD::CondCode CCOpcode = getFCmpCondCode(IcInput); + return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0, + Src1, DAG.getCondCode(CCOpcode)); +} + void SITargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { @@ -3761,8 +3846,13 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, else Opcode = AMDGPUISD::CVT_PK_U16_U32; - SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1); - Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt)); + EVT VT = N->getValueType(0); + if (isTypeLegal(VT)) + Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1)); + else { + SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1); + Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt)); + } return; } } @@ -3895,15 +3985,15 @@ void SITargetLowering::createDebuggerPrologueStackObjects( bool 
SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { const Triple &TT = getTargetMachine().getTargetTriple(); - return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || - GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && + return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || + GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && AMDGPU::shouldEmitConstantsToTextSection(TT); } bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { - return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || - GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || - GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && + return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || + GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); } @@ -4038,6 +4128,23 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); } +SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); + + // FIXME: Assert during eslection that this is only selected for + // ieee_mode. Currently a combine can produce the ieee version for non-ieee + // mode functions, but this happens to be OK since it's only done in cases + // where there is known no sNaN. 
+ if (IsIEEEMode) + return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); + + if (VT == MVT::v4f16) + return splitBinaryVectorOp(Op, DAG); + return Op; +} + SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Chain = Op.getOperand(0); @@ -4091,10 +4198,10 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const { // FIXME: Use inline constants (src_{shared, private}_base) instead. if (Subtarget->hasApertureRegs()) { - unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ? + unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; - unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ? + unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; unsigned Encoding = @@ -4119,7 +4226,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, // Offset into amd_queue_t for group_segment_aperture_base_hi / // private_segment_aperture_base_hi. - uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44; + uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset); @@ -4127,7 +4234,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, // TODO: We should use the value from the IR intrinsic call, but it might not // be available and how do we get it? 
Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()), - AMDGPUASI.CONSTANT_ADDRESS)); + AMDGPUAS::CONSTANT_ADDRESS)); MachinePointerInfo PtrInfo(V, StructOffset); return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo, @@ -4148,11 +4255,11 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, static_cast<const AMDGPUTargetMachine &>(getTargetMachine()); // flat -> local/private - if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) { + if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { unsigned DestAS = ASC->getDestAddressSpace(); - if (DestAS == AMDGPUASI.LOCAL_ADDRESS || - DestAS == AMDGPUASI.PRIVATE_ADDRESS) { + if (DestAS == AMDGPUAS::LOCAL_ADDRESS || + DestAS == AMDGPUAS::PRIVATE_ADDRESS) { unsigned NullVal = TM.getNullPointerValue(DestAS); SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); @@ -4164,11 +4271,11 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, } // local/private -> flat - if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) { + if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { unsigned SrcAS = ASC->getSrcAddressSpace(); - if (SrcAS == AMDGPUASI.LOCAL_ADDRESS || - SrcAS == AMDGPUASI.PRIVATE_ADDRESS) { + if (SrcAS == AMDGPUAS::LOCAL_ADDRESS || + SrcAS == AMDGPUAS::PRIVATE_ADDRESS) { unsigned NullVal = TM.getNullPointerValue(SrcAS); SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32); @@ -4335,30 +4442,39 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, } assert(VT == MVT::v2f16 || VT == MVT::v2i16); + assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); SDValue Lo = Op.getOperand(0); SDValue Hi = Op.getOperand(1); - Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo); - Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi); + // Avoid adding defined bits with the zero_extend. 
+ if (Hi.isUndef()) { + Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo); + SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo); + return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo); + } - Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo); + Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi); Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi); SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi, DAG.getConstant(16, SL, MVT::i32)); + if (Lo.isUndef()) + return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi); - SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi); + Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo); + Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo); + SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi); return DAG.getNode(ISD::BITCAST, SL, VT, Or); } bool SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // We can fold offsets for anything that doesn't require a GOT relocation. - return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || - GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || - GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && + return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || + GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && !shouldEmitGOTReloc(GA->getGlobal()); } @@ -4409,18 +4525,15 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GSD->getGlobal(); - - if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS && - GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT && - GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS && - // FIXME: It isn't correct to rely on the type of the pointer. This should - // be removed when address space 0 is 64-bit. 
- !GV->getType()->getElementType()->isFunctionTy()) + if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS || + GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); SDLoc DL(GSD); EVT PtrVT = Op.getValueType(); + // FIXME: Should not make address space based decisions here. if (shouldEmitFixup(GV)) return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); else if (shouldEmitPCReloc(GV)) @@ -4431,11 +4544,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SIInstrInfo::MO_GOTPCREL32); Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext()); - PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); const DataLayout &DataLayout = DAG.getDataLayout(); unsigned Align = DataLayout.getABITypeAlignment(PtrTy); - // FIXME: Use a PseudoSourceValue once those can be assigned an address space. - MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getGOT(DAG.getMachineFunction()); return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align, MachineMemOperand::MODereferenceable | @@ -4547,11 +4660,115 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, return Value == 0; } +// Re-construct the required return value for a image load intrinsic. +// This is more complicated due to the optional use TexFailCtrl which means the required +// return type is an aggregate +static SDValue constructRetValue(SelectionDAG &DAG, + MachineSDNode *Result, + ArrayRef<EVT> ResultTypes, + bool IsTexFail, bool Unpacked, bool IsD16, + int DMaskPop, int NumVDataDwords, + const SDLoc &DL, LLVMContext &Context) { + // Determine the required return type. This is the same regardless of IsTexFail flag + EVT ReqRetVT = ResultTypes[0]; + EVT ReqRetEltVT = ReqRetVT.isVector() ? 
ReqRetVT.getVectorElementType() : ReqRetVT; + int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1; + EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT; + EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts) + : AdjEltVT + : ReqRetVT; + + // Extract data part of the result + // Bitcast the result to the same type as the required return type + int NumElts; + if (IsD16 && !Unpacked) + NumElts = NumVDataDwords << 1; + else + NumElts = NumVDataDwords; + + EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts) + : AdjEltVT; + + // Special case for v8f16. Rather than add support for this, use v4i32 to + // extract the data elements + bool V8F16Special = false; + if (CastVT == MVT::v8f16) { + CastVT = MVT::v4i32; + DMaskPop >>= 1; + ReqRetNumElts >>= 1; + V8F16Special = true; + AdjVT = MVT::v2i32; + } + + SDValue N = SDValue(Result, 0); + SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N); + + // Iterate over the result + SmallVector<SDValue, 4> BVElts; + + if (CastVT.isVector()) { + DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop); + } else { + BVElts.push_back(CastRes); + } + int ExtraElts = ReqRetNumElts - DMaskPop; + while(ExtraElts--) + BVElts.push_back(DAG.getUNDEF(AdjEltVT)); + + SDValue PreTFCRes; + if (ReqRetNumElts > 1) { + SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts); + if (IsD16 && Unpacked) + PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked); + else + PreTFCRes = NewVec; + } else { + PreTFCRes = BVElts[0]; + } + + if (V8F16Special) + PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes); + + if (!IsTexFail) { + if (Result->getNumValues() > 1) + return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL); + else + return PreTFCRes; + } + + // Extract the TexFail result and insert into aggregate return + SmallVector<SDValue, 1> TFCElt; + DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1); + SDValue TFCRes = 
DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]); + return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL); +} + +static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, + SDValue *LWE, bool &IsTexFail) { + auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode()); + if (!TexFailCtrlConst) + return false; + + uint64_t Value = TexFailCtrlConst->getZExtValue(); + if (Value) { + IsTexFail = true; + } + + SDLoc DL(TexFailCtrlConst); + *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32); + Value &= ~(uint64_t)0x1; + *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32); + Value &= ~(uint64_t)0x2; + + return Value == 0; +} + SDValue SITargetLowering::lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, SelectionDAG &DAG) const { SDLoc DL(Op); MachineFunction &MF = DAG.getMachineFunction(); + const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>(); const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); @@ -4559,12 +4776,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op, AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode); unsigned IntrOpcode = Intr->BaseOpcode; - SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end()); + SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end()); + SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end()); bool IsD16 = false; + bool IsA16 = false; SDValue VData; int NumVDataDwords; + bool AdjustRetType = false; + unsigned AddrIdx; // Index of first address argument unsigned DMask; + unsigned DMaskLanes = 0; if (BaseOpcode->Atomic) { VData = Op.getOperand(2); @@ -4587,7 +4809,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op, AddrIdx = 3; } } else { - unsigned DMaskIdx; + unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 
2 : 1; + auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx)); + if (!DMaskConst) + return Op; + DMask = DMaskConst->getZExtValue(); + DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask); if (BaseOpcode->Store) { VData = Op.getOperand(2); @@ -4603,58 +4830,91 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32; - DMaskIdx = 3; } else { - MVT LoadVT = Op.getSimpleValueType(); + // Work out the num dwords based on the dmask popcount and underlying type + // and whether packing is supported. + MVT LoadVT = ResultTypes[0].getSimpleVT(); if (LoadVT.getScalarType() == MVT::f16) { if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS || !BaseOpcode->HasD16) return Op; // D16 is unsupported for this instruction IsD16 = true; - if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem()) - ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32; } - NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32; - DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1; - } + // Confirm that the return type is large enough for the dmask specified + if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) || + (!LoadVT.isVector() && DMaskLanes > 1)) + return Op; - auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx)); - if (!DMaskConst) - return Op; + if (IsD16 && !Subtarget->hasUnpackedD16VMem()) + NumVDataDwords = (DMaskLanes + 1) / 2; + else + NumVDataDwords = DMaskLanes; - AddrIdx = DMaskIdx + 1; - DMask = DMaskConst->getZExtValue(); - if (!DMask && !BaseOpcode->Store) { - // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they - // store the channels' default values. 
- SDValue Undef = DAG.getUNDEF(Op.getValueType()); - if (isa<MemSDNode>(Op)) - return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL); - return Undef; + AdjustRetType = true; } + + AddrIdx = DMaskIdx + 1; } - unsigned NumVAddrs = BaseOpcode->NumExtraArgs + - (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) + - (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) + - (BaseOpcode->LodOrClampOrMip ? 1 : 0); + unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0; + unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0; + unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0; + unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients + + NumCoords + NumLCM; + unsigned NumMIVAddrs = NumVAddrs; + SmallVector<SDValue, 4> VAddrs; - for (unsigned i = 0; i < NumVAddrs; ++i) - VAddrs.push_back(Op.getOperand(AddrIdx + i)); // Optimize _L to _LZ when _L is zero if (LZMappingInfo) { if (auto ConstantLod = - dyn_cast<ConstantFPSDNode>(VAddrs[NumVAddrs-1].getNode())) { + dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) { if (ConstantLod->isZero() || ConstantLod->isNegative()) { IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l - VAddrs.pop_back(); // remove 'lod' + NumMIVAddrs--; // remove 'lod' } } } + // Check for 16 bit addresses and pack if true. + unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; + MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType(); + const MVT VAddrScalarVT = VAddrVT.getScalarType(); + if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) && + ST->hasFeature(AMDGPU::FeatureR128A16)) { + IsA16 = true; + const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; + for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) { + SDValue AddrLo, AddrHi; + // Push back extra arguments. + if (i < DimIdx) { + AddrLo = Op.getOperand(i); + } else { + AddrLo = Op.getOperand(i); + // Dz/dh, dz/dv and the last odd coord are packed with undef. 
Also, + // in 1D, derivatives dx/dh and dx/dv are packed with undef. + if (((i + 1) >= (AddrIdx + NumMIVAddrs)) || + ((NumGradients / 2) % 2 == 1 && + (i == DimIdx + (NumGradients / 2) - 1 || + i == DimIdx + NumGradients - 1))) { + AddrHi = DAG.getUNDEF(MVT::f16); + } else { + AddrHi = Op.getOperand(i + 1); + i++; + } + AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT, + {AddrLo, AddrHi}); + AddrLo = DAG.getBitcast(MVT::i32, AddrLo); + } + VAddrs.push_back(AddrLo); + } + } else { + for (unsigned i = 0; i < NumMIVAddrs; ++i) + VAddrs.push_back(Op.getOperand(AddrIdx + i)); + } + SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs); SDValue True = DAG.getTargetConstant(1, DL, MVT::i1); @@ -4674,11 +4934,53 @@ SDValue SITargetLowering::lowerImage(SDValue Op, CtrlIdx = AddrIdx + NumVAddrs + 3; } + SDValue TFE; + SDValue LWE; SDValue TexFail = Op.getOperand(CtrlIdx); - auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode()); - if (!TexFailConst || TexFailConst->getZExtValue() != 0) + bool IsTexFail = false; + if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail)) return Op; + if (IsTexFail) { + if (!DMaskLanes) { + // Expecting to get an error flag since TFC is on - and dmask is 0 + // Force dmask to be at least 1 otherwise the instruction will fail + DMask = 0x1; + DMaskLanes = 1; + NumVDataDwords = 1; + } + NumVDataDwords += 1; + AdjustRetType = true; + } + + // Has something earlier tagged that the return type needs adjusting + // This happens if the instruction is a load or has set TexFailCtrl flags + if (AdjustRetType) { + // NumVDataDwords reflects the true number of dwords required in the return type + if (DMaskLanes == 0 && !BaseOpcode->Store) { + // This is a no-op load. 
This can be eliminated + SDValue Undef = DAG.getUNDEF(Op.getValueType()); + if (isa<MemSDNode>(Op)) + return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL); + return Undef; + } + + // Have to use a power of 2 number of dwords + NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords); + + EVT NewVT = NumVDataDwords > 1 ? + EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords) + : MVT::f32; + + ResultTypes[0] = NewVT; + if (ResultTypes.size() == 3) { + // Original result was aggregate type used for TexFailCtrl results + // The actual instruction returns as a vector type which has now been + // created. Remove the aggregate result. + ResultTypes.erase(&ResultTypes[1]); + } + } + SDValue GLC; SDValue SLC; if (BaseOpcode->Atomic) { @@ -4701,9 +5003,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, Ops.push_back(Unorm); Ops.push_back(GLC); Ops.push_back(SLC); - Ops.push_back(False); // r128 - Ops.push_back(False); // tfe - Ops.push_back(False); // lwe + Ops.push_back(IsA16 && // a16 or r128 + ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False); + Ops.push_back(TFE); // tfe + Ops.push_back(LWE); // lwe Ops.push_back(DimInfo->DA ? True : False); if (BaseOpcode->HasD16) Ops.push_back(IsD16 ? 
True : False); @@ -4723,25 +5026,90 @@ SDValue SITargetLowering::lowerImage(SDValue Op, MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops); if (auto MemOp = dyn_cast<MemSDNode>(Op)) { - MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1); - *MemRefs = MemOp->getMemOperand(); - NewNode->setMemRefs(MemRefs, MemRefs + 1); + MachineMemOperand *MemRef = MemOp->getMemOperand(); + DAG.setNodeMemRefs(NewNode, {MemRef}); } if (BaseOpcode->AtomicX2) { SmallVector<SDValue, 1> Elt; DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); - } else if (IsD16 && !BaseOpcode->Store) { - MVT LoadVT = Op.getSimpleValueType(); - SDValue Adjusted = adjustLoadValueTypeImpl( - SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem()); - return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL); + } else if (!BaseOpcode->Store) { + return constructRetValue(DAG, NewNode, + OrigResultTypes, IsTexFail, + Subtarget->hasUnpackedD16VMem(), IsD16, + DMaskLanes, NumVDataDwords, DL, + *DAG.getContext()); } return SDValue(NewNode, 0); } +SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, + SDValue Offset, SDValue GLC, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + VT.getStoreSize(), VT.getStoreSize()); + + if (!Offset->isDivergent()) { + SDValue Ops[] = { + Rsrc, + Offset, // Offset + GLC // glc + }; + return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL, + DAG.getVTList(VT), Ops, VT, MMO); + } + + // We have a divergent offset. Emit a MUBUF buffer load instead. We can + // assume that the buffer is unswizzled. + SmallVector<SDValue, 4> Loads; + unsigned NumLoads = 1; + MVT LoadVT = VT.getSimpleVT(); + unsigned NumElts = LoadVT.isVector() ? 
LoadVT.getVectorNumElements() : 1; + assert((LoadVT.getScalarType() == MVT::i32 || + LoadVT.getScalarType() == MVT::f32) && + isPowerOf2_32(NumElts)); + + if (NumElts == 8 || NumElts == 16) { + NumLoads = NumElts == 16 ? 4 : 2; + LoadVT = MVT::v4i32; + } + + SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue}); + unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue(); + SDValue Ops[] = { + DAG.getEntryNode(), // Chain + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + {}, // voffset + {}, // soffset + {}, // offset + DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy + DAG.getConstant(0, DL, MVT::i1), // idxen + }; + + // Use the alignment to ensure that the required offsets will fit into the + // immediate offsets. + setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4); + + uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue(); + for (unsigned i = 0; i < NumLoads; ++i) { + Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32); + Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, + Ops, LoadVT, MMO)); + } + + if (VT == MVT::v8i32 || VT == MVT::v16i32) + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads); + + return Loads[0]; +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -4755,14 +5123,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_implicit_buffer_ptr: { - if (getSubtarget()->isAmdCodeObjectV2(MF.getFunction())) + if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction())) return emitNonHSAIntrinsicError(DAG, DL, VT); return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); } case Intrinsic::amdgcn_dispatch_ptr: case Intrinsic::amdgcn_queue_ptr: { - if (!Subtarget->isAmdCodeObjectV2(MF.getFunction())) { + if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) { DiagnosticInfoUnsupported 
BadIntrin( MF.getFunction(), "unsupported hsa intrinsic without hsa target", DL.getDebugLoc()); @@ -4880,12 +5248,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::r600_read_tgid_z: return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); - case Intrinsic::amdgcn_workitem_id_x: { + case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::r600_read_tidig_x: return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDX); - } case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::r600_read_tidig_y: return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, @@ -4896,19 +5263,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDZ); - case AMDGPUIntrinsic::SI_load_const: { - SDValue Ops[] = { - Op.getOperand(1), - Op.getOperand(2) - }; - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, - Op->getVTList(), Ops, VT, MMO); + case SIIntrinsic::SI_load_const: { + SDValue Load = + lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2), + DAG.getTargetConstant(0, DL, MVT::i1), DAG); + return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load); + } + case Intrinsic::amdgcn_s_buffer_load: { + unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); + return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), + DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG); } case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); @@ -4991,34 +5355,15 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Denominator, Numerator); } case Intrinsic::amdgcn_icmp: { - const auto *CD = 
dyn_cast<ConstantSDNode>(Op.getOperand(3)); - if (!CD) - return DAG.getUNDEF(VT); - - int CondCode = CD->getSExtValue(); - if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE || - CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE) - return DAG.getUNDEF(VT); - - ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); - ISD::CondCode CCOpcode = getICmpCondCode(IcInput); - return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1), - Op.getOperand(2), DAG.getCondCode(CCOpcode)); + // There is a Pat that handles this variant, so return it as-is. + if (Op.getOperand(1).getValueType() == MVT::i1 && + Op.getConstantOperandVal(2) == 0 && + Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE) + return Op; + return lowerICMPIntrinsic(*this, Op.getNode(), DAG); } case Intrinsic::amdgcn_fcmp: { - const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3)); - if (!CD) - return DAG.getUNDEF(VT); - - int CondCode = CD->getSExtValue(); - if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE || - CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) - return DAG.getUNDEF(VT); - - FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode); - ISD::CondCode CCOpcode = getFCmpCondCode(IcInput); - return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1), - Op.getOperand(2), DAG.getCondCode(CCOpcode)); + return lowerFCMPIntrinsic(*this, Op.getNode(), DAG); } case Intrinsic::amdgcn_fmed3: return DAG.getNode(AMDGPUISD::FMED3, DL, VT, @@ -5058,6 +5403,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, else Opcode = AMDGPUISD::CVT_PK_U16_U32; + if (isTypeLegal(VT)) + return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2)); + SDValue Node = DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2)); return DAG.getNode(ISD::BITCAST, DL, VT, Node); @@ -5127,36 +5475,104 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, } case Intrinsic::amdgcn_buffer_load: case 
Intrinsic::amdgcn_buffer_load_format: { + unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); + unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); + unsigned IdxEn = 1; + if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3))) + IdxEn = Idx->getZExtValue() != 0; SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc Op.getOperand(3), // vindex - Op.getOperand(4), // offset - Op.getOperand(5), // glc - Op.getOperand(6) // slc + SDValue(), // voffset -- will be set by setBufferOffsets + SDValue(), // soffset -- will be set by setBufferOffsets + SDValue(), // offset -- will be set by setBufferOffsets + DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getConstant(IdxEn, DL, MVT::i1), // idxen }; + setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; + EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); auto *M = cast<MemSDNode>(Op); EVT LoadVT = Op.getValueType(); - bool IsD16 = LoadVT.getScalarType() == MVT::f16; - if (IsD16) - return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG); + if (LoadVT.getScalarType() == MVT::f16) + return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, + M, DAG, Ops); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand()); + } + case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_raw_buffer_load_format: { + auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(4), // soffset + Offsets.second, // offset + Op.getOperand(5), // cachepolicy + DAG.getConstant(0, DL, MVT::i1), // idxen + }; + + unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ? 
+ AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; + + EVT VT = Op.getValueType(); + EVT IntVT = VT.changeTypeToInteger(); + auto *M = cast<MemSDNode>(Op); + EVT LoadVT = Op.getValueType(); + + if (LoadVT.getScalarType() == MVT::f16) + return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, + M, DAG, Ops); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand()); + } + case Intrinsic::amdgcn_struct_buffer_load: + case Intrinsic::amdgcn_struct_buffer_load_format: { + auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // cachepolicy + DAG.getConstant(1, DL, MVT::i1), // idxen + }; + + unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ? + AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; + + EVT VT = Op.getValueType(); + EVT IntVT = VT.changeTypeToInteger(); + auto *M = cast<MemSDNode>(Op); + EVT LoadVT = Op.getValueType(); + + if (LoadVT.getScalarType() == MVT::f16) + return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, + M, DAG, Ops); return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, M->getMemOperand()); } case Intrinsic::amdgcn_tbuffer_load: { MemSDNode *M = cast<MemSDNode>(Op); EVT LoadVT = Op.getValueType(); - bool IsD16 = LoadVT.getScalarType() == MVT::f16; - if (IsD16) { - return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG); - } + unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue(); + unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue(); + unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue(); + unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue(); + unsigned IdxEn = 1; + if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3))) + IdxEn = Idx->getZExtValue() 
!= 0; SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc @@ -5164,12 +5580,62 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(4), // voffset Op.getOperand(5), // soffset Op.getOperand(6), // offset - Op.getOperand(7), // dfmt - Op.getOperand(8), // nfmt - Op.getOperand(9), // glc - Op.getOperand(10) // slc + DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + }; + + if (LoadVT.getScalarType() == MVT::f16) + return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, + M, DAG, Ops); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, LoadVT, + M->getMemOperand()); + } + case Intrinsic::amdgcn_raw_tbuffer_load: { + MemSDNode *M = cast<MemSDNode>(Op); + EVT LoadVT = Op.getValueType(); + auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); + + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(4), // soffset + Offsets.second, // offset + Op.getOperand(5), // format + Op.getOperand(6), // cachepolicy + DAG.getConstant(0, DL, MVT::i1), // idxen + }; + + if (LoadVT.getScalarType() == MVT::f16) + return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, + M, DAG, Ops); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, LoadVT, + M->getMemOperand()); + } + case Intrinsic::amdgcn_struct_tbuffer_load: { + MemSDNode *M = cast<MemSDNode>(Op); + EVT LoadVT = Op.getValueType(); + auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // format + Op.getOperand(7), // cachepolicy + 
DAG.getConstant(1, DL, MVT::i1), // idxen }; + if (LoadVT.getScalarType() == MVT::f16) + return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, + M, DAG, Ops); return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, Op->getVTList(), Ops, LoadVT, M->getMemOperand()); @@ -5184,14 +5650,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_and: case Intrinsic::amdgcn_buffer_atomic_or: case Intrinsic::amdgcn_buffer_atomic_xor: { + unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); + unsigned IdxEn = 1; + if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) + IdxEn = Idx->getZExtValue() != 0; SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // vdata Op.getOperand(3), // rsrc Op.getOperand(4), // vindex - Op.getOperand(5), // offset - Op.getOperand(6) // slc + SDValue(), // voffset -- will be set by setBufferOffsets + SDValue(), // soffset -- will be set by setBufferOffsets + SDValue(), // offset -- will be set by setBufferOffsets + DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getConstant(IdxEn, DL, MVT::i1), // idxen }; + setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); @@ -5235,16 +5709,193 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); } + case Intrinsic::amdgcn_raw_buffer_atomic_swap: + case Intrinsic::amdgcn_raw_buffer_atomic_add: + case Intrinsic::amdgcn_raw_buffer_atomic_sub: + case Intrinsic::amdgcn_raw_buffer_atomic_smin: + case Intrinsic::amdgcn_raw_buffer_atomic_umin: + case Intrinsic::amdgcn_raw_buffer_atomic_smax: + case Intrinsic::amdgcn_raw_buffer_atomic_umax: + case Intrinsic::amdgcn_raw_buffer_atomic_and: + case Intrinsic::amdgcn_raw_buffer_atomic_or: + case Intrinsic::amdgcn_raw_buffer_atomic_xor: { + auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + 
SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // vdata + Op.getOperand(3), // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // cachepolicy + DAG.getConstant(0, DL, MVT::i1), // idxen + }; + EVT VT = Op.getValueType(); + + auto *M = cast<MemSDNode>(Op); + unsigned Opcode = 0; + + switch (IntrID) { + case Intrinsic::amdgcn_raw_buffer_atomic_swap: + Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP; + break; + case Intrinsic::amdgcn_raw_buffer_atomic_add: + Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD; + break; + case Intrinsic::amdgcn_raw_buffer_atomic_sub: + Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB; + break; + case Intrinsic::amdgcn_raw_buffer_atomic_smin: + Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN; + break; + case Intrinsic::amdgcn_raw_buffer_atomic_umin: + Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN; + break; + case Intrinsic::amdgcn_raw_buffer_atomic_smax: + Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX; + break; + case Intrinsic::amdgcn_raw_buffer_atomic_umax: + Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX; + break; + case Intrinsic::amdgcn_raw_buffer_atomic_and: + Opcode = AMDGPUISD::BUFFER_ATOMIC_AND; + break; + case Intrinsic::amdgcn_raw_buffer_atomic_or: + Opcode = AMDGPUISD::BUFFER_ATOMIC_OR; + break; + case Intrinsic::amdgcn_raw_buffer_atomic_xor: + Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; + break; + default: + llvm_unreachable("unhandled atomic opcode"); + } + + return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, + M->getMemOperand()); + } + case Intrinsic::amdgcn_struct_buffer_atomic_swap: + case Intrinsic::amdgcn_struct_buffer_atomic_add: + case Intrinsic::amdgcn_struct_buffer_atomic_sub: + case Intrinsic::amdgcn_struct_buffer_atomic_smin: + case Intrinsic::amdgcn_struct_buffer_atomic_umin: + case Intrinsic::amdgcn_struct_buffer_atomic_smax: + case Intrinsic::amdgcn_struct_buffer_atomic_umax: + case Intrinsic::amdgcn_struct_buffer_atomic_and: + 
case Intrinsic::amdgcn_struct_buffer_atomic_or: + case Intrinsic::amdgcn_struct_buffer_atomic_xor: { + auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Offsets.first, // voffset + Op.getOperand(6), // soffset + Offsets.second, // offset + Op.getOperand(7), // cachepolicy + DAG.getConstant(1, DL, MVT::i1), // idxen + }; + EVT VT = Op.getValueType(); + + auto *M = cast<MemSDNode>(Op); + unsigned Opcode = 0; + + switch (IntrID) { + case Intrinsic::amdgcn_struct_buffer_atomic_swap: + Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP; + break; + case Intrinsic::amdgcn_struct_buffer_atomic_add: + Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD; + break; + case Intrinsic::amdgcn_struct_buffer_atomic_sub: + Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB; + break; + case Intrinsic::amdgcn_struct_buffer_atomic_smin: + Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN; + break; + case Intrinsic::amdgcn_struct_buffer_atomic_umin: + Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN; + break; + case Intrinsic::amdgcn_struct_buffer_atomic_smax: + Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX; + break; + case Intrinsic::amdgcn_struct_buffer_atomic_umax: + Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX; + break; + case Intrinsic::amdgcn_struct_buffer_atomic_and: + Opcode = AMDGPUISD::BUFFER_ATOMIC_AND; + break; + case Intrinsic::amdgcn_struct_buffer_atomic_or: + Opcode = AMDGPUISD::BUFFER_ATOMIC_OR; + break; + case Intrinsic::amdgcn_struct_buffer_atomic_xor: + Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; + break; + default: + llvm_unreachable("unhandled atomic opcode"); + } + return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, + M->getMemOperand()); + } case Intrinsic::amdgcn_buffer_atomic_cmpswap: { + unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue(); + unsigned IdxEn = 1; + if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5))) + IdxEn = Idx->getZExtValue() != 0; + 
SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // src + Op.getOperand(3), // cmp + Op.getOperand(4), // rsrc + Op.getOperand(5), // vindex + SDValue(), // voffset -- will be set by setBufferOffsets + SDValue(), // soffset -- will be set by setBufferOffsets + SDValue(), // offset -- will be set by setBufferOffsets + DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + }; + setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); + EVT VT = Op.getValueType(); + auto *M = cast<MemSDNode>(Op); + + return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, + Op->getVTList(), Ops, VT, M->getMemOperand()); + } + case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: { + auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // src + Op.getOperand(3), // cmp + Op.getOperand(4), // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(6), // soffset + Offsets.second, // offset + Op.getOperand(7), // cachepolicy + DAG.getConstant(0, DL, MVT::i1), // idxen + }; + EVT VT = Op.getValueType(); + auto *M = cast<MemSDNode>(Op); + + return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, + Op->getVTList(), Ops, VT, M->getMemOperand()); + } + case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: { + auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // src Op.getOperand(3), // cmp Op.getOperand(4), // rsrc Op.getOperand(5), // vindex - Op.getOperand(6), // offset - Op.getOperand(7) // slc + Offsets.first, // voffset + Op.getOperand(7), // soffset + Offsets.second, // offset + Op.getOperand(8), // cachepolicy + DAG.getConstant(1, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); @@ -5360,19 +6011,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return 
DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, Op.getOperand(2), Op.getOperand(3)); } - case AMDGPUIntrinsic::AMDGPU_kill: { - SDValue Src = Op.getOperand(2); - if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) { - if (!K->isNegative()) - return Chain; - - SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32); - return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne); - } - - SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src); - return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast); - } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -5383,69 +6021,79 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } return SDValue(); }; - case AMDGPUIntrinsic::SI_tbuffer_store: { - - // Extract vindex and voffset from vaddr as appropriate - const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10)); - const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11)); - SDValue VAddr = Op.getOperand(5); - - SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); - - assert(!(OffEn->isOne() && IdxEn->isOne()) && - "Legacy intrinsic doesn't support both offset and index - use new version"); - - SDValue VIndex = IdxEn->isOne() ? VAddr : Zero; - SDValue VOffset = OffEn->isOne() ? VAddr : Zero; - - // Deal with the vec-3 case - const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4)); - auto Opcode = NumChannels->getZExtValue() == 3 ? 
- AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT; - + case Intrinsic::amdgcn_tbuffer_store: { + SDValue VData = Op.getOperand(2); + bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + if (IsD16) + VData = handleD16VData(VData, DAG); + unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue(); + unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue(); + unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue(); + unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue(); + unsigned IdxEn = 1; + if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) + IdxEn = Idx->getZExtValue() != 0; SDValue Ops[] = { - Chain, - Op.getOperand(3), // vdata - Op.getOperand(2), // rsrc - VIndex, - VOffset, - Op.getOperand(6), // soffset - Op.getOperand(7), // inst_offset - Op.getOperand(8), // dfmt - Op.getOperand(9), // nfmt - Op.getOperand(12), // glc - Op.getOperand(13), // slc + Chain, + VData, // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Op.getOperand(5), // voffset + Op.getOperand(6), // soffset + Op.getOperand(7), // offset + DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getConstant(IdxEn, DL, MVT::i1), // idexen }; - - assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 && - "Value of tfe other than zero is unsupported"); - - EVT VT = Op.getOperand(3).getValueType(); - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(Opcode, DL, - Op->getVTList(), Ops, VT, MMO); + unsigned Opc = IsD16 ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : + AMDGPUISD::TBUFFER_STORE_FORMAT; + MemSDNode *M = cast<MemSDNode>(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); } - case Intrinsic::amdgcn_tbuffer_store: { + case Intrinsic::amdgcn_struct_tbuffer_store: { SDValue VData = Op.getOperand(2); bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); if (IsD16) VData = handleD16VData(VData, DAG); + auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { Chain, VData, // vdata Op.getOperand(3), // rsrc Op.getOperand(4), // vindex - Op.getOperand(5), // voffset + Offsets.first, // voffset Op.getOperand(6), // soffset - Op.getOperand(7), // offset - Op.getOperand(8), // dfmt - Op.getOperand(9), // nfmt - Op.getOperand(10), // glc - Op.getOperand(11) // slc + Offsets.second, // offset + Op.getOperand(7), // format + Op.getOperand(8), // cachepolicy + DAG.getConstant(1, DL, MVT::i1), // idexen + }; + unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : + AMDGPUISD::TBUFFER_STORE_FORMAT; + MemSDNode *M = cast<MemSDNode>(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); + } + + case Intrinsic::amdgcn_raw_tbuffer_store: { + SDValue VData = Op.getOperand(2); + bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + if (IsD16) + VData = handleD16VData(VData, DAG); + auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + SDValue Ops[] = { + Chain, + VData, // vdata + Op.getOperand(3), // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // format + Op.getOperand(7), // cachepolicy + DAG.getConstant(0, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -5460,15 +6108,23 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); if (IsD16) VData = handleD16VData(VData, DAG); + unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); + unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue(); + unsigned IdxEn = 1; + if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) + IdxEn = Idx->getZExtValue() != 0; SDValue Ops[] = { Chain, - VData, // vdata + VData, Op.getOperand(3), // rsrc Op.getOperand(4), // vindex - Op.getOperand(5), // offset - Op.getOperand(6), // glc - Op.getOperand(7) // slc + SDValue(), // voffset -- will be set by setBufferOffsets + SDValue(), // soffset -- will be set by setBufferOffsets + SDValue(), // offset -- will be set by setBufferOffsets + DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getConstant(IdxEn, DL, MVT::i1), // idxen }; + setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; @@ -5476,6 +6132,59 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } + + case Intrinsic::amdgcn_raw_buffer_store: + case Intrinsic::amdgcn_raw_buffer_store_format: { + SDValue VData = Op.getOperand(2); + bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + if (IsD16) + VData = handleD16VData(VData, DAG); + auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + SDValue Ops[] = { + Chain, + VData, + Op.getOperand(3), // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // cachepolicy + DAG.getConstant(0, DL, MVT::i1), // idxen + }; + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ? + AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; + Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; + MemSDNode *M = cast<MemSDNode>(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); + } + + case Intrinsic::amdgcn_struct_buffer_store: + case Intrinsic::amdgcn_struct_buffer_store_format: { + SDValue VData = Op.getOperand(2); + bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16); + if (IsD16) + VData = handleD16VData(VData, DAG); + auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); + SDValue Ops[] = { + Chain, + VData, + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Offsets.first, // voffset + Op.getOperand(6), // soffset + Offsets.second, // offset + Op.getOperand(7), // cachepolicy + DAG.getConstant(1, DL, MVT::i1), // idxen + }; + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ? + AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; + Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; + MemSDNode *M = cast<MemSDNode>(Op); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); + } + default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -5486,6 +6195,94 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, } } +// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: +// offset (the offset that is included in bounds checking and swizzling, to be +// split between the instruction's voffset and immoffset fields) and soffset +// (the offset that is excluded from bounds checking and swizzling, to go in +// the instruction's soffset field). This function takes the first kind of +// offset and figures out how to split it between voffset and immoffset. +std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets( + SDValue Offset, SelectionDAG &DAG) const { + SDLoc DL(Offset); + const unsigned MaxImm = 4095; + SDValue N0 = Offset; + ConstantSDNode *C1 = nullptr; + + if ((C1 = dyn_cast<ConstantSDNode>(N0))) + N0 = SDValue(); + else if (DAG.isBaseWithConstantOffset(N0)) { + C1 = cast<ConstantSDNode>(N0.getOperand(1)); + N0 = N0.getOperand(0); + } + + if (C1) { + unsigned ImmOffset = C1->getZExtValue(); + // If the immediate value is too big for the immoffset field, put the value + // and -4096 into the immoffset field so that the value that is copied/added + // for the voffset field is a multiple of 4096, and it stands more chance + // of being CSEd with the copy/add for another similar load/store. + // However, do not do that rounding down to a multiple of 4096 if that is a + // negative number, as it appears to be illegal to have a negative offset + // in the vgpr, even if adding the immediate offset makes it positive. 
+ unsigned Overflow = ImmOffset & ~MaxImm; + ImmOffset -= Overflow; + if ((int32_t)Overflow < 0) { + Overflow += ImmOffset; + ImmOffset = 0; + } + C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32)); + if (Overflow) { + auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32); + if (!N0) + N0 = OverflowVal; + else { + SDValue Ops[] = { N0, OverflowVal }; + N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops); + } + } + } + if (!N0) + N0 = DAG.getConstant(0, DL, MVT::i32); + if (!C1) + C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32)); + return {N0, SDValue(C1, 0)}; +} + +// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the +// three offsets (voffset, soffset and instoffset) into the SDValue[3] array +// pointed to by Offsets. +void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, + SelectionDAG &DAG, SDValue *Offsets, + unsigned Align) const { + SDLoc DL(CombinedOffset); + if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) { + uint32_t Imm = C->getZExtValue(); + uint32_t SOffset, ImmOffset; + if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) { + Offsets[0] = DAG.getConstant(0, DL, MVT::i32); + Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); + Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32); + return; + } + } + if (DAG.isBaseWithConstantOffset(CombinedOffset)) { + SDValue N0 = CombinedOffset.getOperand(0); + SDValue N1 = CombinedOffset.getOperand(1); + uint32_t SOffset, ImmOffset; + int Offset = cast<ConstantSDNode>(N1)->getSExtValue(); + if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, + Subtarget, Align)) { + Offsets[0] = N0; + Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); + Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32); + return; + } + } + Offsets[0] = CombinedOffset; + Offsets[1] = DAG.getConstant(0, DL, MVT::i32); + Offsets[2] = DAG.getConstant(0, DL, MVT::i32); +} + static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, 
ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT) { @@ -5513,8 +6310,8 @@ SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const // FIXME: Constant loads should all be marked invariant. unsigned AS = Ld->getAddressSpace(); - if (AS != AMDGPUASI.CONSTANT_ADDRESS && - AS != AMDGPUASI.CONSTANT_ADDRESS_32BIT && + if (AS != AMDGPUAS::CONSTANT_ADDRESS && + AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT && (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant())) return SDValue(); @@ -5625,15 +6422,15 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // If there is a possibilty that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. - if (AS == AMDGPUASI.FLAT_ADDRESS) + if (AS == AMDGPUAS::FLAT_ADDRESS) AS = MFI->hasFlatScratchInit() ? - AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS; + AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; unsigned NumElements = MemVT.getVectorNumElements(); - if (AS == AMDGPUASI.CONSTANT_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) { - if (!Op->isDivergent() && Alignment >= 4) + if (AS == AMDGPUAS::CONSTANT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private @@ -5641,28 +6438,28 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { // } - if (AS == AMDGPUASI.CONSTANT_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT || - AS == AMDGPUASI.GLOBAL_ADDRESS) { + if (AS == AMDGPUAS::CONSTANT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || + AS == AMDGPUAS::GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) && - 
Alignment >= 4) + Alignment >= 4 && NumElements < 32) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. // } - if (AS == AMDGPUASI.CONSTANT_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT || - AS == AMDGPUASI.GLOBAL_ADDRESS || - AS == AMDGPUASI.FLAT_ADDRESS) { + if (AS == AMDGPUAS::CONSTANT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || + AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorLoad(Op, DAG); // v4 loads are supported for private and global memory. return SDValue(); } - if (AS == AMDGPUASI.PRIVATE_ADDRESS) { + if (AS == AMDGPUAS::PRIVATE_ADDRESS) { // Depending on the setting of the private_element_size field in the // resource descriptor, we can only make private accesses up to a certain // size. @@ -5681,7 +6478,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("unsupported private_element_size"); } - } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { // Use ds_read_b128 if possible. if (Subtarget->useDS128() && Load->getAlignment() >= 16 && MemVT.getStoreSize() == 16) @@ -5689,6 +6486,17 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (NumElements > 2) return SplitVectorLoad(Op, DAG); + + // SI has a hardware bug in the LDS / GDS boounds checking: if the base + // address is negative, then the instruction is incorrectly treated as + // out-of-bounds even if base + offsets is in bounds. Split vectorized + // loads here to avoid emitting ds_read2_b32. We may re-combine the + // load later in the SILoadStoreOptimizer. 
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && + NumElements == 2 && MemVT.getStoreSize() == 8 && + Load->getAlignment() < 8) { + return SplitVectorLoad(Op, DAG); + } } return SDValue(); } @@ -6058,17 +6866,17 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // If there is a possibilty that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. - if (AS == AMDGPUASI.FLAT_ADDRESS) + if (AS == AMDGPUAS::FLAT_ADDRESS) AS = MFI->hasFlatScratchInit() ? - AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS; + AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; unsigned NumElements = VT.getVectorNumElements(); - if (AS == AMDGPUASI.GLOBAL_ADDRESS || - AS == AMDGPUASI.FLAT_ADDRESS) { + if (AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorStore(Op, DAG); return SDValue(); - } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { + } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { switch (Subtarget->getMaxPrivateElementSize()) { case 4: return scalarizeVectorStore(Store, DAG); @@ -6083,7 +6891,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("unsupported private_element_size"); } - } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { // Use ds_write_b128 if possible. if (Subtarget->useDS128() && Store->getAlignment() >= 16 && VT.getStoreSize() == 16) @@ -6091,6 +6899,18 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if (NumElements > 2) return SplitVectorStore(Op, DAG); + + // SI has a hardware bug in the LDS / GDS boounds checking: if the base + // address is negative, then the instruction is incorrectly treated as + // out-of-bounds even if base + offsets is in bounds. 
Split vectorized + // stores here to avoid emitting ds_write2_b32. We may re-combine the + // store later in the SILoadStoreOptimizer. + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && + NumElements == 2 && VT.getStoreSize() == 8 && + Store->getAlignment() < 8) { + return SplitVectorStore(Op, DAG); + } + return SDValue(); } else { llvm_unreachable("unhandled address space"); @@ -6101,17 +6921,24 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); + SDValue TrigVal; + // TODO: Should this propagate fast-math-flags? - SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, - DAG.getNode(ISD::FMUL, DL, VT, Arg, - DAG.getConstantFP(0.5/M_PI, DL, - VT))); + + SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT); + + if (Subtarget->hasTrigReducedRange()) { + SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi); + TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal); + } else { + TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi); + } switch (Op.getOpcode()) { case ISD::FCOS: - return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); + return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal); case ISD::FSIN: - return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); + return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal); default: llvm_unreachable("Wrong trig opcode"); } @@ -6123,7 +6950,7 @@ SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) co unsigned AS = AtomicNode->getAddressSpace(); // No custom lowering required for local address space - if (!isFlatGlobalAddrSpace(AS, AMDGPUASI)) + if (!isFlatGlobalAddrSpace(AS)) return Op; // Non-local address space requires custom lowering for atomic compare @@ -6475,6 +7302,29 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, } } + if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS) + 
std::swap(LHS, RHS); + + if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS && + RHS.hasOneUse()) { + ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); + // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan) + // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan) + const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1)); + if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask && + (RHS.getOperand(0) == LHS.getOperand(0) && + LHS.getOperand(0) == LHS.getOperand(1))) { + const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN; + unsigned NewMask = LCC == ISD::SETO ? + Mask->getZExtValue() & ~OrdMask : + Mask->getZExtValue() & OrdMask; + + SDLoc DL(N); + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0), + DAG.getConstant(NewMask, DL, MVT::i32)); + } + } + if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) { // and x, (sext cc from i1) => select cc, x, 0 @@ -6798,158 +7648,294 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N, return AMDGPUTargetLowering::performRcpCombine(N, DCI); } -static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { - if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) +bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, + unsigned MaxDepth) const { + unsigned Opcode = Op.getOpcode(); + if (Opcode == ISD::FCANONICALIZE) return true; - return DAG.isKnownNeverNaN(Op); -} + if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) { + auto F = CFP->getValueAPF(); + if (F.isNaN() && F.isSignaling()) + return false; + return !F.isDenormal() || denormalsEnabledForType(Op.getValueType()); + } -static bool isCanonicalized(SelectionDAG &DAG, SDValue Op, - const GCNSubtarget *ST, unsigned MaxDepth=5) { // If source is a result of another standard FP operation it is already in // canonical form. 
+ if (MaxDepth == 0) + return false; - switch (Op.getOpcode()) { - default: - break; - + switch (Opcode) { // These will flush denorms if required. case ISD::FADD: case ISD::FSUB: case ISD::FMUL: - case ISD::FSQRT: case ISD::FCEIL: case ISD::FFLOOR: case ISD::FMA: case ISD::FMAD: - - case ISD::FCANONICALIZE: - return true; - + case ISD::FSQRT: + case ISD::FDIV: + case ISD::FREM: case ISD::FP_ROUND: - return Op.getValueType().getScalarType() != MVT::f16 || - ST->hasFP16Denormals(); - case ISD::FP_EXTEND: - return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 || - ST->hasFP16Denormals(); + case AMDGPUISD::FMUL_LEGACY: + case AMDGPUISD::FMAD_FTZ: + case AMDGPUISD::RCP: + case AMDGPUISD::RSQ: + case AMDGPUISD::RSQ_CLAMP: + case AMDGPUISD::RCP_LEGACY: + case AMDGPUISD::RSQ_LEGACY: + case AMDGPUISD::RCP_IFLAG: + case AMDGPUISD::TRIG_PREOP: + case AMDGPUISD::DIV_SCALE: + case AMDGPUISD::DIV_FMAS: + case AMDGPUISD::DIV_FIXUP: + case AMDGPUISD::FRACT: + case AMDGPUISD::LDEXP: + case AMDGPUISD::CVT_PKRTZ_F16_F32: + case AMDGPUISD::CVT_F32_UBYTE0: + case AMDGPUISD::CVT_F32_UBYTE1: + case AMDGPUISD::CVT_F32_UBYTE2: + case AMDGPUISD::CVT_F32_UBYTE3: + return true; // It can/will be lowered or combined as a bit operation. // Need to check their input recursively to handle. case ISD::FNEG: case ISD::FABS: - return (MaxDepth > 0) && - isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1); + case ISD::FCOPYSIGN: + return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); case ISD::FSIN: case ISD::FCOS: case ISD::FSINCOS: return Op.getValueType().getScalarType() != MVT::f16; - // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. - // For such targets need to check their input recursively. 
case ISD::FMINNUM: case ISD::FMAXNUM: - case ISD::FMINNAN: - case ISD::FMAXNAN: + case ISD::FMINNUM_IEEE: + case ISD::FMAXNUM_IEEE: + case AMDGPUISD::CLAMP: + case AMDGPUISD::FMED3: + case AMDGPUISD::FMAX3: + case AMDGPUISD::FMIN3: { + // FIXME: Shouldn't treat the generic operations different based these. + // However, we aren't really required to flush the result from + // minnum/maxnum.. - if (ST->supportsMinMaxDenormModes() && - DAG.isKnownNeverNaN(Op.getOperand(0)) && - DAG.isKnownNeverNaN(Op.getOperand(1))) + // snans will be quieted, so we only need to worry about denormals. + if (Subtarget->supportsMinMaxDenormModes() || + denormalsEnabledForType(Op.getValueType())) return true; - return (MaxDepth > 0) && - isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) && - isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1); + // Flushing may be required. + // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such + // targets need to check their input recursively. + + // FIXME: Does this apply with clamp? It's implemented with max. 
+ for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) { + if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1)) + return false; + } - case ISD::ConstantFP: { - auto F = cast<ConstantFPSDNode>(Op)->getValueAPF(); - return !F.isDenormal() && !(F.isNaN() && F.isSignaling()); + return true; } + case ISD::SELECT: { + return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) && + isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1); } - return false; + case ISD::BUILD_VECTOR: { + for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { + SDValue SrcOp = Op.getOperand(i); + if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1)) + return false; + } + + return true; + } + case ISD::EXTRACT_VECTOR_ELT: + case ISD::EXTRACT_SUBVECTOR: { + return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); + } + case ISD::INSERT_VECTOR_ELT: { + return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) && + isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1); + } + case ISD::UNDEF: + // Could be anything. 
+ return false; + + case ISD::BITCAST: { + // Hack round the mess we make when legalizing extract_vector_elt + SDValue Src = Op.getOperand(0); + if (Src.getValueType() == MVT::i16 && + Src.getOpcode() == ISD::TRUNCATE) { + SDValue TruncSrc = Src.getOperand(0); + if (TruncSrc.getValueType() == MVT::i32 && + TruncSrc.getOpcode() == ISD::BITCAST && + TruncSrc.getOperand(0).getValueType() == MVT::v2f16) { + return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1); + } + } + + return false; + } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntrinsicID + = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + // TODO: Handle more intrinsics + switch (IntrinsicID) { + case Intrinsic::amdgcn_cvt_pkrtz: + case Intrinsic::amdgcn_cubeid: + case Intrinsic::amdgcn_frexp_mant: + case Intrinsic::amdgcn_fdot2: + return true; + default: + break; + } + + LLVM_FALLTHROUGH; + } + default: + return denormalsEnabledForType(Op.getValueType()) && + DAG.isKnownNeverSNaN(Op); + } + + llvm_unreachable("invalid operation"); } // Constant fold canonicalize. +SDValue SITargetLowering::getCanonicalConstantFP( + SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const { + // Flush denormals to 0 if not enabled. + if (C.isDenormal() && !denormalsEnabledForType(VT)) + return DAG.getConstantFP(0.0, SL, VT); + + if (C.isNaN()) { + APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics()); + if (C.isSignaling()) { + // Quiet a signaling NaN. + // FIXME: Is this supposed to preserve payload bits? + return DAG.getConstantFP(CanonicalQNaN, SL, VT); + } + + // Make sure it is the canonical NaN bitpattern. + // + // TODO: Can we use -1 as the canonical NaN value since it's an inline + // immediate? + if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt()) + return DAG.getConstantFP(CanonicalQNaN, SL, VT); + } + + // Already canonical. 
+ return DAG.getConstantFP(C, SL, VT); +} + +static bool vectorEltWillFoldAway(SDValue Op) { + return Op.isUndef() || isa<ConstantFPSDNode>(Op); +} + SDValue SITargetLowering::performFCanonicalizeCombine( SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); // fcanonicalize undef -> qnan if (N0.isUndef()) { - EVT VT = N->getValueType(0); APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)); return DAG.getConstantFP(QNaN, SDLoc(N), VT); } - ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0); - if (!CFP) { - SDValue N0 = N->getOperand(0); - EVT VT = N0.getValueType().getScalarType(); - auto ST = getSubtarget(); + if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) { + EVT VT = N->getValueType(0); + return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF()); + } - if (((VT == MVT::f32 && ST->hasFP32Denormals()) || - (VT == MVT::f64 && ST->hasFP64Denormals()) || - (VT == MVT::f16 && ST->hasFP16Denormals())) && - DAG.isKnownNeverNaN(N0)) - return N0; + // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x), + // (fcanonicalize k) + // + // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0 - bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); + // TODO: This could be better with wider vectors that will be split to v2f16, + // and to consider uses since there aren't that many packed operations. 
+ if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 && + isTypeLegal(MVT::v2f16)) { + SDLoc SL(N); + SDValue NewElts[2]; + SDValue Lo = N0.getOperand(0); + SDValue Hi = N0.getOperand(1); + EVT EltVT = Lo.getValueType(); - if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) && - isCanonicalized(DAG, N0, ST)) - return N0; + if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) { + for (unsigned I = 0; I != 2; ++I) { + SDValue Op = N0.getOperand(I); + if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) { + NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT, + CFP->getValueAPF()); + } else if (Op.isUndef()) { + // Handled below based on what the other operand is. + NewElts[I] = Op; + } else { + NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op); + } + } - return SDValue(); - } + // If one half is undef, and one is constant, perfer a splat vector rather + // than the normal qNaN. If it's a register, prefer 0.0 since that's + // cheaper to use and may be free with a packed operation. + if (NewElts[0].isUndef()) { + if (isa<ConstantFPSDNode>(NewElts[1])) + NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ? + NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT); + } - const APFloat &C = CFP->getValueAPF(); + if (NewElts[1].isUndef()) { + NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ? + NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT); + } - // Flush denormals to 0 if not enabled. 
- if (C.isDenormal()) { - EVT VT = N->getValueType(0); - EVT SVT = VT.getScalarType(); - if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals()) - return DAG.getConstantFP(0.0, SDLoc(N), VT); + return DAG.getBuildVector(VT, SL, NewElts); + } + } - if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals()) - return DAG.getConstantFP(0.0, SDLoc(N), VT); + unsigned SrcOpc = N0.getOpcode(); - if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals()) - return DAG.getConstantFP(0.0, SDLoc(N), VT); - } + // If it's free to do so, push canonicalizes further up the source, which may + // find a canonical source. + // + // TODO: More opcodes. Note this is unsafe for the the _ieee minnum/maxnum for + // sNaNs. + if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) { + auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1)); + if (CRHS && N0.hasOneUse()) { + SDLoc SL(N); + SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT, + N0.getOperand(0)); + SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF()); + DCI.AddToWorklist(Canon0.getNode()); - if (C.isNaN()) { - EVT VT = N->getValueType(0); - APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics()); - if (C.isSignaling()) { - // Quiet a signaling NaN. - return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); + return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1); } - - // Make sure it is the canonical NaN bitpattern. - // - // TODO: Can we use -1 as the canonical NaN value since it's an inline - // immediate? - if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt()) - return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); } - return N0; + return isCanonicalized(DAG, N0) ? 
N0 : SDValue(); } static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { switch (Opc) { case ISD::FMAXNUM: + case ISD::FMAXNUM_IEEE: return AMDGPUISD::FMAX3; case ISD::SMAX: return AMDGPUISD::SMAX3; case ISD::UMAX: return AMDGPUISD::UMAX3; case ISD::FMINNUM: + case ISD::FMINNUM_IEEE: return AMDGPUISD::FMIN3; case ISD::SMIN: return AMDGPUISD::SMIN3; @@ -7044,11 +8030,18 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, // then give the other result, which is different from med3 with a NaN // input. SDValue Var = Op0.getOperand(0); - if (!isKnownNeverSNan(DAG, Var)) + if (!DAG.isKnownNeverSNaN(Var)) return SDValue(); - return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), - Var, SDValue(K0, 0), SDValue(K1, 0)); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + + if ((!K0->hasOneUse() || + TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) && + (!K1->hasOneUse() || + TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) { + return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), + Var, SDValue(K0, 0), SDValue(K1, 0)); + } } return SDValue(); @@ -7109,6 +8102,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || + (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) || (Opc == AMDGPUISD::FMIN_LEGACY && Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && (VT == MVT::f32 || VT == MVT::f64 || @@ -7216,9 +8210,11 @@ SDValue SITargetLowering::performExtractVectorEltCombine( switch(Opc) { default: - return SDValue(); + break; // TODO: Support other binary operations. 
case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: case ISD::ADD: case ISD::UMIN: case ISD::UMAX: @@ -7226,25 +8222,54 @@ SDValue SITargetLowering::performExtractVectorEltCombine( case ISD::SMAX: case ISD::FMAXNUM: case ISD::FMINNUM: - return DAG.getNode(Opc, SL, EltVT, - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, - Vec.getOperand(0), Idx), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, - Vec.getOperand(1), Idx)); + case ISD::FMAXNUM_IEEE: + case ISD::FMINNUM_IEEE: { + SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec.getOperand(0), Idx); + SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec.getOperand(1), Idx); + + DCI.AddToWorklist(Elt0.getNode()); + DCI.AddToWorklist(Elt1.getNode()); + return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags()); + } } } - if (!DCI.isBeforeLegalize()) - return SDValue(); - unsigned VecSize = VecVT.getSizeInBits(); unsigned EltSize = EltVT.getSizeInBits(); + // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx) + // This elminates non-constant index and subsequent movrel or scratch access. + // Sub-dword vectors of size 2 dword or less have better implementation. + // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32 + // instructions. + if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) && + !isa<ConstantSDNode>(N->getOperand(1))) { + SDLoc SL(N); + SDValue Idx = N->getOperand(1); + EVT IdxVT = Idx.getValueType(); + SDValue V; + for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { + SDValue IC = DAG.getConstant(I, SL, IdxVT); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC); + if (I == 0) + V = Elt; + else + V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ); + } + return V; + } + + if (!DCI.isBeforeLegalize()) + return SDValue(); + // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit // elements. 
This exposes more load reduction opportunities by replacing // multiple small extract_vector_elements with a single 32-bit extract. auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (EltSize <= 16 && + if (isa<MemSDNode>(Vec) && + EltSize <= 16 && EltVT.isByteSized() && VecSize > 32 && VecSize % 32 == 0 && @@ -7274,46 +8299,40 @@ SDValue SITargetLowering::performExtractVectorEltCombine( return SDValue(); } -static bool convertBuildVectorCastElt(SelectionDAG &DAG, - SDValue &Lo, SDValue &Hi) { - if (Hi.getOpcode() == ISD::BITCAST && - Hi.getOperand(0).getValueType() == MVT::f16 && - (isa<ConstantSDNode>(Lo) || Lo.isUndef())) { - Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo); - Hi = Hi.getOperand(0); - return true; - } - - return false; -} - -SDValue SITargetLowering::performBuildVectorCombine( - SDNode *N, DAGCombinerInfo &DCI) const { - SDLoc SL(N); +SDValue +SITargetLowering::performInsertVectorEltCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SDValue Vec = N->getOperand(0); + SDValue Idx = N->getOperand(2); + EVT VecVT = Vec.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + unsigned VecSize = VecVT.getSizeInBits(); + unsigned EltSize = EltVT.getSizeInBits(); - if (!isTypeLegal(MVT::v2i16)) + // INSERT_VECTOR_ELT (<n x e>, var-idx) + // => BUILD_VECTOR n x select (e, const-idx) + // This elminates non-constant index and subsequent movrel or scratch access. + // Sub-dword vectors of size 2 dword or less have better implementation. + // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32 + // instructions. 
+ if (isa<ConstantSDNode>(Idx) || + VecSize > 256 || (VecSize <= 64 && EltSize < 32)) return SDValue(); - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - - if (VT == MVT::v2i16) { - SDValue Lo = N->getOperand(0); - SDValue Hi = N->getOperand(1); - // v2i16 build_vector (const|undef), (bitcast f16:$x) - // -> bitcast (v2f16 build_vector const|undef, $x - if (convertBuildVectorCastElt(DAG, Lo, Hi)) { - SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi }); - return DAG.getNode(ISD::BITCAST, SL, VT, NewVec); - } + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + SDValue Ins = N->getOperand(1); + EVT IdxVT = Idx.getValueType(); - if (convertBuildVectorCastElt(DAG, Hi, Lo)) { - SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo }); - return DAG.getNode(ISD::BITCAST, SL, VT, NewVec); - } + SmallVector<SDValue, 16> Ops; + for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { + SDValue IC = DAG.getConstant(I, SL, IdxVT); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC); + SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ); + Ops.push_back(V); } - return SDValue(); + return DAG.getBuildVector(VecVT, SL, Ops); } unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, @@ -7568,7 +8587,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N, EVT VT = N->getValueType(0); SDLoc SL(N); - if (!Subtarget->hasDLInsts() || VT != MVT::f32) + if (!Subtarget->hasDotInsts() || VT != MVT::f32) return SDValue(); // FMA((F32)S0.x, (F32)S1. 
x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) -> @@ -7705,16 +8724,26 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N, VT != MVT::f16)) return SDValue(); - // Match isinf pattern + // Match isinf/isfinite pattern // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) - if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) { + // (fcmp one (fabs x), inf) -> (fp_class x, + // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero) + if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) { const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS); if (!CRHS) return SDValue(); const APFloat &APF = CRHS->getValueAPF(); if (APF.isInfinity() && !APF.isNegative()) { - unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY; + const unsigned IsInfMask = SIInstrFlags::P_INFINITY | + SIInstrFlags::N_INFINITY; + const unsigned IsFiniteMask = SIInstrFlags::N_ZERO | + SIInstrFlags::P_ZERO | + SIInstrFlags::N_NORMAL | + SIInstrFlags::P_NORMAL | + SIInstrFlags::N_SUBNORMAL | + SIInstrFlags::P_SUBNORMAL; + unsigned Mask = CC == ISD::SETOEQ ? 
IsInfMask : IsFiniteMask; return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0), DAG.getConstant(Mask, SL, MVT::i32)); } @@ -7759,8 +8788,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) || - TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) { + if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) { DCI.CommitTargetLoweringOpt(TLO); } @@ -7792,6 +8820,9 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (getTargetMachine().getOptLevel() == CodeGenOpt::None) + return SDValue(); + switch (N->getOpcode()) { default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); @@ -7810,17 +8841,15 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performSetCCCombine(N, DCI); case ISD::FMAXNUM: case ISD::FMINNUM: + case ISD::FMAXNUM_IEEE: + case ISD::FMINNUM_IEEE: case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: case ISD::UMIN: case AMDGPUISD::FMIN_LEGACY: - case AMDGPUISD::FMAX_LEGACY: { - if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && - getTargetMachine().getOptLevel() > CodeGenOpt::None) - return performMinMaxCombine(N, DCI); - break; - } + case AMDGPUISD::FMAX_LEGACY: + return performMinMaxCombine(N, DCI); case ISD::FMA: return performFMACombine(N, DCI); case ISD::LOAD: { @@ -7912,8 +8941,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, } case ISD::EXTRACT_VECTOR_ELT: return performExtractVectorEltCombine(N, DCI); - case ISD::BUILD_VECTOR: - return performBuildVectorCombine(N, DCI); + case ISD::INSERT_VECTOR_ELT: + return performInsertVectorEltCombine(N, DCI); } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } @@ -7926,6 +8955,7 @@ static unsigned SubIdx2Lane(unsigned Idx) { case 
AMDGPU::sub1: return 1; case AMDGPU::sub2: return 2; case AMDGPU::sub3: return 3; + case AMDGPU::sub4: return 4; // Possible with TFE/LWE } } @@ -7939,11 +8969,16 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx)) return Node; // not implemented for D16 - SDNode *Users[4] = { nullptr }; + SDNode *Users[5] = { nullptr }; unsigned Lane = 0; unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1; unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); unsigned NewDmask = 0; + unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1; + unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1; + bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) || + Node->getConstantOperandVal(LWEIdx)) ? 1 : 0; + unsigned TFCLane = 0; bool HasChain = Node->getNumValues() > 1; if (OldDmask == 0) { @@ -7951,6 +8986,12 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, return Node; } + unsigned OldBitsSet = countPopulation(OldDmask); + // Work out which is the TFE/LWE lane if that is enabled. + if (UsesTFC) { + TFCLane = OldBitsSet; + } + // Try to figure out the used register components for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); I != E; ++I) { @@ -7970,28 +9011,49 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, // set, etc. Lane = SubIdx2Lane(I->getConstantOperandVal(1)); - // Set which texture component corresponds to the lane. - unsigned Comp; - for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { - Comp = countTrailingZeros(Dmask); - Dmask &= ~(1 << Comp); - } + // Check if the use is for the TFE/LWE generated result at VGPRn+1. + if (UsesTFC && Lane == TFCLane) { + Users[Lane] = *I; + } else { + // Set which texture component corresponds to the lane. 
+ unsigned Comp; + for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) { + Comp = countTrailingZeros(Dmask); + Dmask &= ~(1 << Comp); + } - // Abort if we have more than one user per component - if (Users[Lane]) - return Node; + // Abort if we have more than one user per component. + if (Users[Lane]) + return Node; - Users[Lane] = *I; - NewDmask |= 1 << Comp; + Users[Lane] = *I; + NewDmask |= 1 << Comp; + } } + // Don't allow 0 dmask, as hardware assumes one channel enabled. + bool NoChannels = !NewDmask; + if (NoChannels) { + // If the original dmask has one channel - then nothing to do + if (OldBitsSet == 1) + return Node; + // Use an arbitrary dmask - required for the instruction to work + NewDmask = 1; + } // Abort if there's no change if (NewDmask == OldDmask) return Node; unsigned BitsSet = countPopulation(NewDmask); - int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet); + // Check for TFE or LWE - increase the number of channels by one to account + // for the extra return value + // This will need adjustment for D16 if this is also included in + // adjustWriteMask (this function) but at present D16 are excluded. + unsigned NewChannels = BitsSet + UsesTFC; + + int NewOpcode = + AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels); assert(NewOpcode != -1 && NewOpcode != static_cast<int>(Node->getMachineOpcode()) && "failed to find equivalent MIMG op"); @@ -8004,8 +9066,9 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT(); - MVT ResultVT = BitsSet == 1 ? - SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet); + MVT ResultVT = NewChannels == 1 ? + SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 : + NewChannels == 5 ? 8 : NewChannels); SDVTList NewVTList = HasChain ? 
DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT); @@ -8015,11 +9078,11 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, if (HasChain) { // Update chain. - NewNode->setMemRefs(Node->memoperands_begin(), Node->memoperands_end()); + DAG.setNodeMemRefs(NewNode, Node->memoperands()); DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1)); } - if (BitsSet == 1) { + if (NewChannels == 1) { assert(Node->hasNUsesOfValue(1, 0)); SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node), Users[Lane]->getValueType(0), @@ -8029,19 +9092,24 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, } // Update the users of the node with the new indices - for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { + for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) { SDNode *User = Users[i]; - if (!User) - continue; - - SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); - DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op); + if (!User) { + // Handle the special case of NoChannels. We set NewDmask to 1 above, but + // Users[0] is still nullptr because channel 0 doesn't really have a use. + if (i || !NoChannels) + continue; + } else { + SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); + DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op); + } switch (Idx) { default: break; case AMDGPU::sub0: Idx = AMDGPU::sub1; break; case AMDGPU::sub1: Idx = AMDGPU::sub2; break; case AMDGPU::sub2: Idx = AMDGPU::sub3; break; + case AMDGPU::sub3: Idx = AMDGPU::sub4; break; } } @@ -8457,49 +9525,56 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits); } +LLVM_ATTRIBUTE_UNUSED +static bool isCopyFromRegOfInlineAsm(const SDNode *N) { + assert(N->getOpcode() == ISD::CopyFromReg); + do { + // Follow the chain until we find an INLINEASM node. 
+ N = N->getOperand(0).getNode(); + if (N->getOpcode() == ISD::INLINEASM) + return true; + } while (N->getOpcode() == ISD::CopyFromReg); + return false; +} + bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, - FunctionLoweringInfo * FLI, DivergenceAnalysis * DA) const + FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const { switch (N->getOpcode()) { - case ISD::Register: case ISD::CopyFromReg: { - const RegisterSDNode *R = nullptr; - if (N->getOpcode() == ISD::Register) { - R = dyn_cast<RegisterSDNode>(N); - } - else { - R = dyn_cast<RegisterSDNode>(N->getOperand(1)); - } - if (R) - { - const MachineFunction * MF = FLI->MF; - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); - const MachineRegisterInfo &MRI = MF->getRegInfo(); - const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); - unsigned Reg = R->getReg(); - if (TRI.isPhysicalRegister(Reg)) - return TRI.isVGPR(MRI, Reg); + const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1)); + const MachineFunction * MF = FLI->MF; + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); + unsigned Reg = R->getReg(); + if (TRI.isPhysicalRegister(Reg)) + return !TRI.isSGPRReg(MRI, Reg); - if (MRI.isLiveIn(Reg)) { - // workitem.id.x workitem.id.y workitem.id.z - // Any VGPR formal argument is also considered divergent - if (TRI.isVGPR(MRI, Reg)) - return true; - // Formal arguments of non-entry functions - // are conservatively considered divergent - else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) - return true; - } - return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg)); + if (MRI.isLiveIn(Reg)) { + // workitem.id.x workitem.id.y workitem.id.z + // Any VGPR formal argument is also considered divergent + if (!TRI.isSGPRReg(MRI, Reg)) + return true; + // Formal arguments of non-entry functions + // are conservatively 
considered divergent + else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) + return true; + return false; } + const Value *V = FLI->getValueFromVirtualReg(Reg); + if (V) + return KDA->isDivergent(V); + assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N)); + return !TRI.isSGPRReg(MRI, Reg); } break; case ISD::LOAD: { - const LoadSDNode *L = dyn_cast<LoadSDNode>(N); - if (L->getMemOperand()->getAddrSpace() == - Subtarget->getAMDGPUAS().PRIVATE_ADDRESS) - return true; + const LoadSDNode *L = cast<LoadSDNode>(N); + unsigned AS = L->getAddressSpace(); + // A flat load may access private memory. + return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS; } break; case ISD::CALLSEQ_END: return true; @@ -8522,3 +9597,30 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, } return false; } + +bool SITargetLowering::denormalsEnabledForType(EVT VT) const { + switch (VT.getScalarType().getSimpleVT().SimpleTy) { + case MVT::f32: + return Subtarget->hasFP32Denormals(); + case MVT::f64: + return Subtarget->hasFP64Denormals(); + case MVT::f16: + return Subtarget->hasFP16Denormals(); + default: + return false; + } +} + +bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, + const SelectionDAG &DAG, + bool SNaN, + unsigned Depth) const { + if (Op.getOpcode() == AMDGPUISD::CLAMP) { + if (Subtarget->enableDX10Clamp()) + return true; // Clamped to 0. 
+ return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); + } + + return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, + SNaN, Depth); +} diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 5b3d49b3d8e30..bcef519ee6635 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -60,11 +60,22 @@ private: MVT VT, unsigned Offset) const; SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, SelectionDAG &DAG) const; + SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset, + SDValue GLC, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; + // The raw.tbuffer and struct.tbuffer intrinsics have two offset args: offset + // (the offset that is included in bounds checking and swizzling, to be split + // between the instruction's voffset and immoffset fields) and soffset (the + // offset that is excluded from bounds checking and swizzling, to go in the + // instruction's soffset field). This function takes the first kind of + // offset and figures out how to split it between voffset and immoffset. + std::pair<SDValue, SDValue> splitBufferOffsets(SDValue Offset, + SelectionDAG &DAG) const; + SDValue widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; @@ -81,7 +92,7 @@ private: SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M, - SelectionDAG &DAG, + SelectionDAG &DAG, ArrayRef<SDValue> Ops, bool IsIntrinsic = false) const; SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const; @@ -99,6 +110,7 @@ private: /// Custom lowering for ISD::FP_ROUND for MVT::f16. 
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; SDValue getSegmentAperture(unsigned AS, const SDLoc &DL, SelectionDAG &DAG) const; @@ -130,6 +142,8 @@ private: SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue getCanonicalConstantFP(SelectionDAG &DAG, const SDLoc &SL, EVT VT, + const APFloat &C) const; SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, @@ -140,7 +154,7 @@ private: SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; @@ -156,7 +170,6 @@ private: SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; - bool isLegalGlobalAddressingMode(const AddrMode &AM) const; bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; unsigned isCFIntrinsic(const SDNode *Intr) const; @@ -175,6 +188,12 @@ private: /// global value \p GV, false otherwise. bool shouldEmitPCReloc(const GlobalValue *GV) const; + // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the + // three offsets (voffset, soffset and instoffset) into the SDValue[3] array + // pointed to by Offsets. 
+ void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG, + SDValue *Offsets, unsigned Align = 4) const; + public: SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI); @@ -192,6 +211,7 @@ public: SmallVectorImpl<Value*> &/*Ops*/, Type *&/*AccessTy*/) const override; + bool isLegalGlobalAddressingMode(const AddrMode &AM) const; bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I = nullptr) const override; @@ -215,7 +235,7 @@ public: bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; TargetLoweringBase::LegalizeTypeAction - getPreferredVectorAction(EVT VT) const override; + getPreferredVectorAction(MVT VT) const override; bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; @@ -248,11 +268,11 @@ public: void passSpecialInputs( CallLoweringInfo &CLI, + CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, SmallVectorImpl<SDValue> &MemOpChains, - SDValue Chain, - SDValue StackPtr) const; + SDValue Chain) const; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, @@ -322,7 +342,16 @@ public: unsigned Depth = 0) const override; bool isSDNodeSourceOfDivergence(const SDNode *N, - FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const override; + FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override; + + bool isCanonicalized(SelectionDAG &DAG, SDValue Op, + unsigned MaxDepth = 5) const; + bool denormalsEnabledForType(EVT VT) const; + + bool isKnownNeverNaNForTargetNode(SDValue Op, + const SelectionDAG &DAG, + bool SNaN = false, + unsigned Depth = 0) const override; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp index dc9397cf7b85e..ba21a5ce1293a 100644 --- a/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -66,6 +66,8 @@ 
private: bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); + bool optimizeVccBranch(MachineInstr &MI) const; + public: static char ID; @@ -320,6 +322,96 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, return true; } +bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const { + // Match: + // sreg = -1 + // vcc = S_AND_B64 exec, sreg + // S_CBRANCH_VCC[N]Z + // => + // S_CBRANCH_EXEC[N]Z + bool Changed = false; + MachineBasicBlock &MBB = *MI.getParent(); + const unsigned CondReg = AMDGPU::VCC; + const unsigned ExecReg = AMDGPU::EXEC; + const unsigned And = AMDGPU::S_AND_B64; + + MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(), + E = MBB.rend(); + bool ReadsCond = false; + unsigned Threshold = 5; + for (++A ; A != E ; ++A) { + if (!--Threshold) + return false; + if (A->modifiesRegister(ExecReg, TRI)) + return false; + if (A->modifiesRegister(CondReg, TRI)) { + if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And) + return false; + break; + } + ReadsCond |= A->readsRegister(CondReg, TRI); + } + if (A == E) + return false; + + MachineOperand &Op1 = A->getOperand(1); + MachineOperand &Op2 = A->getOperand(2); + if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) { + TII->commuteInstruction(*A); + Changed = true; + } + if (Op1.getReg() != ExecReg) + return Changed; + if (Op2.isImm() && Op2.getImm() != -1) + return Changed; + + unsigned SReg = AMDGPU::NoRegister; + if (Op2.isReg()) { + SReg = Op2.getReg(); + auto M = std::next(A); + bool ReadsSreg = false; + for ( ; M != E ; ++M) { + if (M->definesRegister(SReg, TRI)) + break; + if (M->modifiesRegister(SReg, TRI)) + return Changed; + ReadsSreg |= M->readsRegister(SReg, TRI); + } + if (M == E || + !M->isMoveImmediate() || + !M->getOperand(1).isImm() || + M->getOperand(1).getImm() != -1) + return Changed; + // First if sreg is only used in and instruction fold the immediate + // into that and. 
+ if (!ReadsSreg && Op2.isKill()) { + A->getOperand(2).ChangeToImmediate(-1); + M->eraseFromParent(); + } + } + + if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) && + MI.killsRegister(CondReg, TRI)) + A->eraseFromParent(); + + bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ; + if (SReg == ExecReg) { + if (IsVCCZ) { + MI.eraseFromParent(); + return true; + } + MI.setDesc(TII->get(AMDGPU::S_BRANCH)); + } else { + MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ + : AMDGPU::S_CBRANCH_EXECNZ)); + } + + MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI)); + MI.addImplicitDefUseOperands(*MBB.getParent()); + + return true; +} + bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); @@ -384,7 +476,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { kill(MI); if (ExecBranchStack.empty()) { - if (skipIfDead(MI, *NextBB)) { + if (NextBB != BE && skipIfDead(MI, *NextBB)) { HaveSkipBlock = true; NextBB = std::next(BI); BE = MF.end(); @@ -417,6 +509,11 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { } break; + case AMDGPU::S_CBRANCH_VCCZ: + case AMDGPU::S_CBRANCH_VCCNZ: + MadeChange |= optimizeVccBranch(MI); + break; + default: break; } diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index d456e3d9b94d0..afc0b44676109 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -13,6 +13,14 @@ /// Memory reads and writes are issued asynchronously, so we need to insert /// S_WAITCNT instructions when we want to access any of their results or /// overwrite any register that's used asynchronously. +/// +/// TODO: This pass currently keeps one timeline per hardware counter. A more +/// finely-grained approach that keeps one timeline per event type could +/// sometimes get away with generating weaker s_waitcnt instructions. 
For +/// example, when both SMEM and LDS are in flight and we need to wait for +/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient, +/// but the pass will currently generate a conservative lgkmcnt(0) because +/// multiple event types are in flight. // //===----------------------------------------------------------------------===// @@ -33,7 +41,6 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -69,6 +76,25 @@ static cl::opt<unsigned> ForceEmitZeroFlag( namespace { +template <typename EnumT> +class enum_iterator + : public iterator_facade_base<enum_iterator<EnumT>, + std::forward_iterator_tag, const EnumT> { + EnumT Value; +public: + enum_iterator() = default; + enum_iterator(EnumT Value) : Value(Value) {} + + enum_iterator &operator++() { + Value = static_cast<EnumT>(Value + 1); + return *this; + } + + bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; } + + EnumT operator*() const { return Value; } +}; + // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether // s_waitcnt instruction needs to be emited. 
@@ -77,12 +103,17 @@ namespace { enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS }; +iterator_range<enum_iterator<InstCounterType>> inst_counter_types() { + return make_range(enum_iterator<InstCounterType>(VM_CNT), + enum_iterator<InstCounterType>(NUM_INST_CNTS)); +} + using RegInterval = std::pair<signed, signed>; struct { - int32_t VmcntMax; - int32_t ExpcntMax; - int32_t LgkmcntMax; + uint32_t VmcntMax; + uint32_t ExpcntMax; + uint32_t LgkmcntMax; int32_t NumVGPRsMax; int32_t NumSGPRsMax; } HardwareLimits; @@ -108,6 +139,14 @@ enum WaitEventType { NUM_WAIT_EVENTS, }; +static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = { + (1 << VMEM_ACCESS), + (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) | + (1 << SQ_MESSAGE), + (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) | + (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS), +}; + // The mapping is: // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots @@ -122,30 +161,38 @@ enum RegisterMapping { NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts. }; -#define ForAllWaitEventType(w) \ - for (enum WaitEventType w = (enum WaitEventType)0; \ - (w) < (enum WaitEventType)NUM_WAIT_EVENTS; \ - (w) = (enum WaitEventType)((w) + 1)) +void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { + switch (T) { + case VM_CNT: + Wait.VmCnt = std::min(Wait.VmCnt, Count); + break; + case EXP_CNT: + Wait.ExpCnt = std::min(Wait.ExpCnt, Count); + break; + case LGKM_CNT: + Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count); + break; + default: + llvm_unreachable("bad InstCounterType"); + } +} -// This is a per-basic-block object that maintains current score brackets -// of each wait counter, and a per-register scoreboard for each wait counter. +// This objects maintains the current score brackets of each wait counter, and +// a per-register scoreboard for each wait counter. 
+// // We also maintain the latest score for every event type that can change the // waitcnt in order to know if there are multiple types of events within // the brackets. When multiple types of event happen in the bracket, // wait count may get decreased out of order, therefore we need to put in // "s_waitcnt 0" before use. -class BlockWaitcntBrackets { +class WaitcntBrackets { public: - BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) { - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { + WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) { + for (auto T : inst_counter_types()) memset(VgprScores[T], 0, sizeof(VgprScores[T])); - } } - ~BlockWaitcntBrackets() = default; - - static int32_t getWaitCountMax(InstCounterType T) { + static uint32_t getWaitCountMax(InstCounterType T) { switch (T) { case VM_CNT: return HardwareLimits.VmcntMax; @@ -159,33 +206,14 @@ public: return 0; } - void setScoreLB(InstCounterType T, int32_t Val) { - assert(T < NUM_INST_CNTS); - if (T >= NUM_INST_CNTS) - return; - ScoreLBs[T] = Val; - } - - void setScoreUB(InstCounterType T, int32_t Val) { - assert(T < NUM_INST_CNTS); - if (T >= NUM_INST_CNTS) - return; - ScoreUBs[T] = Val; - if (T == EXP_CNT) { - int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT)); - if (ScoreLBs[T] < UB) - ScoreLBs[T] = UB; - } - } - - int32_t getScoreLB(InstCounterType T) { + uint32_t getScoreLB(InstCounterType T) const { assert(T < NUM_INST_CNTS); if (T >= NUM_INST_CNTS) return 0; return ScoreLBs[T]; } - int32_t getScoreUB(InstCounterType T) { + uint32_t getScoreUB(InstCounterType T) const { assert(T < NUM_INST_CNTS); if (T >= NUM_INST_CNTS) return 0; @@ -194,89 +222,56 @@ public: // Mapping from event to counter. 
InstCounterType eventCounter(WaitEventType E) { - switch (E) { - case VMEM_ACCESS: + if (E == VMEM_ACCESS) return VM_CNT; - case LDS_ACCESS: - case GDS_ACCESS: - case SQ_MESSAGE: - case SMEM_ACCESS: + if (WaitEventMaskForInst[LGKM_CNT] & (1 << E)) return LGKM_CNT; - case EXP_GPR_LOCK: - case GDS_GPR_LOCK: - case VMW_GPR_LOCK: - case EXP_POS_ACCESS: - case EXP_PARAM_ACCESS: - return EXP_CNT; - default: - llvm_unreachable("unhandled event type"); - } - return NUM_INST_CNTS; - } - - void setRegScore(int GprNo, InstCounterType T, int32_t Val) { - if (GprNo < NUM_ALL_VGPRS) { - if (GprNo > VgprUB) { - VgprUB = GprNo; - } - VgprScores[T][GprNo] = Val; - } else { - assert(T == LGKM_CNT); - if (GprNo - NUM_ALL_VGPRS > SgprUB) { - SgprUB = GprNo - NUM_ALL_VGPRS; - } - SgprScores[GprNo - NUM_ALL_VGPRS] = Val; - } + assert(WaitEventMaskForInst[EXP_CNT] & (1 << E)); + return EXP_CNT; } - int32_t getRegScore(int GprNo, InstCounterType T) { + uint32_t getRegScore(int GprNo, InstCounterType T) { if (GprNo < NUM_ALL_VGPRS) { return VgprScores[T][GprNo]; } + assert(T == LGKM_CNT); return SgprScores[GprNo - NUM_ALL_VGPRS]; } void clear() { memset(ScoreLBs, 0, sizeof(ScoreLBs)); memset(ScoreUBs, 0, sizeof(ScoreUBs)); - memset(EventUBs, 0, sizeof(EventUBs)); - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { + PendingEvents = 0; + memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents)); + for (auto T : inst_counter_types()) memset(VgprScores[T], 0, sizeof(VgprScores[T])); - } memset(SgprScores, 0, sizeof(SgprScores)); } + bool merge(const WaitcntBrackets &Other); + RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII, const MachineRegisterInfo *MRI, const SIRegisterInfo *TRI, unsigned OpNo, bool Def) const; - void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, - const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, - unsigned OpNo, int32_t Val); - - void setWaitAtBeginning() { WaitAtBeginning = 
true; } - void clearWaitAtBeginning() { WaitAtBeginning = false; } - bool getWaitAtBeginning() const { return WaitAtBeginning; } - void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; } int32_t getMaxVGPR() const { return VgprUB; } int32_t getMaxSGPR() const { return SgprUB; } - int32_t getEventUB(enum WaitEventType W) const { - assert(W < NUM_WAIT_EVENTS); - return EventUBs[W]; - } - - bool counterOutOfOrder(InstCounterType T); - unsigned int updateByWait(InstCounterType T, int ScoreToWait); + bool counterOutOfOrder(InstCounterType T) const; + bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; + bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + void determineWait(InstCounterType T, uint32_t ScoreToWait, + AMDGPU::Waitcnt &Wait) const; + void applyWaitcnt(const AMDGPU::Waitcnt &Wait); + void applyWaitcnt(InstCounterType T, unsigned Count); void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &MI); - bool hasPendingSMEM() const { - return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] && - EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]); + bool hasPending() const { return PendingEvents != 0; } + bool hasPendingEvent(WaitEventType E) const { + return PendingEvents & (1 << E); } bool hasPendingFlat() const { @@ -291,75 +286,71 @@ public: LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT]; } - int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; } - - void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; } + void print(raw_ostream &); + void dump() { print(dbgs()); } - bool getRevisitLoop() const { return RevisitLoop; } - void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; } +private: + struct MergeInfo { + uint32_t OldLB; + uint32_t OtherLB; + uint32_t MyShift; + uint32_t OtherShift; + }; + static bool mergeScore(const MergeInfo &M, uint32_t &Score, + uint32_t OtherScore); - void setPostOrder(int32_t PostOrderIn) { PostOrder = 
PostOrderIn; } - int32_t getPostOrder() const { return PostOrder; } + void setScoreLB(InstCounterType T, uint32_t Val) { + assert(T < NUM_INST_CNTS); + if (T >= NUM_INST_CNTS) + return; + ScoreLBs[T] = Val; + } - void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; } - void clearWaitcnt() { Waitcnt = nullptr; } - MachineInstr *getWaitcnt() const { return Waitcnt; } + void setScoreUB(InstCounterType T, uint32_t Val) { + assert(T < NUM_INST_CNTS); + if (T >= NUM_INST_CNTS) + return; + ScoreUBs[T] = Val; + if (T == EXP_CNT) { + uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT); + if (ScoreLBs[T] < UB && UB < ScoreUBs[T]) + ScoreLBs[T] = UB; + } + } - bool mixedExpTypes() const { return MixedExpTypes; } - void setMixedExpTypes(bool MixedExpTypesIn) { - MixedExpTypes = MixedExpTypesIn; + void setRegScore(int GprNo, InstCounterType T, uint32_t Val) { + if (GprNo < NUM_ALL_VGPRS) { + if (GprNo > VgprUB) { + VgprUB = GprNo; + } + VgprScores[T][GprNo] = Val; + } else { + assert(T == LGKM_CNT); + if (GprNo - NUM_ALL_VGPRS > SgprUB) { + SgprUB = GprNo - NUM_ALL_VGPRS; + } + SgprScores[GprNo - NUM_ALL_VGPRS] = Val; + } } - void print(raw_ostream &); - void dump() { print(dbgs()); } + void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, + const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, + unsigned OpNo, uint32_t Val); -private: const GCNSubtarget *ST = nullptr; - bool WaitAtBeginning = false; - bool RevisitLoop = false; - bool MixedExpTypes = false; - int32_t PostOrder = 0; - MachineInstr *Waitcnt = nullptr; - int32_t ScoreLBs[NUM_INST_CNTS] = {0}; - int32_t ScoreUBs[NUM_INST_CNTS] = {0}; - int32_t EventUBs[NUM_WAIT_EVENTS] = {0}; + uint32_t ScoreLBs[NUM_INST_CNTS] = {0}; + uint32_t ScoreUBs[NUM_INST_CNTS] = {0}; + uint32_t PendingEvents = 0; + bool MixedPendingEvents[NUM_INST_CNTS] = {false}; // Remember the last flat memory operation. 
- int32_t LastFlat[NUM_INST_CNTS] = {0}; + uint32_t LastFlat[NUM_INST_CNTS] = {0}; // wait_cnt scores for every vgpr. // Keep track of the VgprUB and SgprUB to make merge at join efficient. int32_t VgprUB = 0; int32_t SgprUB = 0; - int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS]; + uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS]; // Wait cnt scores for every sgpr, only lgkmcnt is relevant. - int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0}; -}; - -// This is a per-loop-region object that records waitcnt status at the end of -// loop footer from the previous iteration. We also maintain an iteration -// count to track the number of times the loop has been visited. When it -// doesn't converge naturally, we force convergence by inserting s_waitcnt 0 -// at the end of the loop footer. -class LoopWaitcntData { -public: - LoopWaitcntData() = default; - ~LoopWaitcntData() = default; - - void incIterCnt() { IterCnt++; } - void resetIterCnt() { IterCnt = 0; } - unsigned getIterCnt() { return IterCnt; } - - void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; } - MachineInstr *getWaitcnt() const { return LfWaitcnt; } - - void print() { LLVM_DEBUG(dbgs() << " iteration " << IterCnt << '\n';); } - -private: - // s_waitcnt added at the end of loop footer to stablize wait scores - // at the end of the loop footer. - MachineInstr *LfWaitcnt = nullptr; - // Number of iterations the loop has been visited, not including the initial - // walk over. 
- int32_t IterCnt = 0; + uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0}; }; class SIInsertWaitcnts : public MachineFunctionPass { @@ -368,22 +359,21 @@ private: const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; - const MachineLoopInfo *MLI = nullptr; - AMDGPU::IsaInfo::IsaVersion IV; - AMDGPUAS AMDGPUASI; + AMDGPU::IsaVersion IV; - DenseSet<MachineBasicBlock *> BlockVisitedSet; DenseSet<MachineInstr *> TrackedWaitcntSet; DenseSet<MachineInstr *> VCCZBugHandledSet; - DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>> - BlockWaitcntBracketsMap; + struct BlockInfo { + MachineBasicBlock *MBB; + std::unique_ptr<WaitcntBrackets> Incoming; + bool Dirty = true; - std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet; + explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {} + }; - DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap; - - std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets; + std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index + DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap; // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0 // because of amdgpu-waitcnt-forcezero flag @@ -407,20 +397,11 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineLoopInfo>(); MachineFunctionPass::getAnalysisUsage(AU); } - void addKillWaitBracket(BlockWaitcntBrackets *Bracket) { - // The waitcnt information is copied because it changes as the block is - // traversed. 
- KillWaitBrackets.push_back( - llvm::make_unique<BlockWaitcntBrackets>(*Bracket)); - } - bool isForceEmitWaitcnt() const { - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) + for (auto T : inst_counter_types()) if (ForceEmitWaitcnt[T]) return true; return false; @@ -454,27 +435,22 @@ public: } bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; - void generateWaitcntInstBefore(MachineInstr &MI, - BlockWaitcntBrackets *ScoreBrackets); + bool generateWaitcntInstBefore(MachineInstr &MI, + WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr); void updateEventWaitcntAfter(MachineInstr &Inst, - BlockWaitcntBrackets *ScoreBrackets); - void mergeInputScoreBrackets(MachineBasicBlock &Block); - bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block); - unsigned countNumBottomBlocks(const MachineLoop *Loop); - void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block); - void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst); - bool isWaitcntStronger(unsigned LHS, unsigned RHS); - unsigned combineWaitcnt(unsigned LHS, unsigned RHS); + WaitcntBrackets *ScoreBrackets); + bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, + WaitcntBrackets &ScoreBrackets); }; } // end anonymous namespace -RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI, - const SIInstrInfo *TII, - const MachineRegisterInfo *MRI, - const SIRegisterInfo *TRI, - unsigned OpNo, - bool Def) const { +RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, + const SIInstrInfo *TII, + const MachineRegisterInfo *MRI, + const SIRegisterInfo *TRI, + unsigned OpNo, bool Def) const { const MachineOperand &Op = MI->getOperand(OpNo); if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) || (Def && !Op.isDef())) @@ -512,11 +488,11 @@ RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI, return Result; } -void 
BlockWaitcntBrackets::setExpScore(const MachineInstr *MI, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, - unsigned OpNo, int32_t Val) { +void WaitcntBrackets::setExpScore(const MachineInstr *MI, + const SIInstrInfo *TII, + const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, unsigned OpNo, + uint32_t Val) { RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false); LLVM_DEBUG({ const MachineOperand &Opnd = MI->getOperand(OpNo); @@ -527,26 +503,26 @@ void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI, } } -void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, - WaitEventType E, MachineInstr &Inst) { +void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, + const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, + WaitEventType E, MachineInstr &Inst) { const MachineRegisterInfo &MRIA = *MRI; InstCounterType T = eventCounter(E); - int32_t CurrScore = getScoreUB(T) + 1; - // EventUB and ScoreUB need to be update regardless if this event changes - // the score of a register or not. + uint32_t CurrScore = getScoreUB(T) + 1; + if (CurrScore == 0) + report_fatal_error("InsertWaitcnt score wraparound"); + // PendingEvents and ScoreUB need to be update regardless if this event + // changes the score of a register or not. // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message. - EventUBs[E] = CurrScore; + if (!hasPendingEvent(E)) { + if (PendingEvents & WaitEventMaskForInst[T]) + MixedPendingEvents[T] = true; + PendingEvents |= 1 << E; + } setScoreUB(T, CurrScore); if (T == EXP_CNT) { - // Check for mixed export types. If they are mixed, then a waitcnt exp(0) - // is required. - if (!MixedExpTypes) { - MixedExpTypes = counterOutOfOrder(EXP_CNT); - } - // Put score on the source vgprs. If this is a store, just use those // specific register(s). 
if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) { @@ -671,12 +647,11 @@ void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } } -void BlockWaitcntBrackets::print(raw_ostream &OS) { +void WaitcntBrackets::print(raw_ostream &OS) { OS << '\n'; - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - int LB = getScoreLB(T); - int UB = getScoreUB(T); + for (auto T : inst_counter_types()) { + uint32_t LB = getScoreLB(T); + uint32_t UB = getScoreUB(T); switch (T) { case VM_CNT: @@ -696,10 +671,10 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) { if (LB < UB) { // Print vgpr scores. for (int J = 0; J <= getMaxVGPR(); J++) { - int RegScore = getRegScore(J, T); + uint32_t RegScore = getRegScore(J, T); if (RegScore <= LB) continue; - int RelScore = RegScore - LB - 1; + uint32_t RelScore = RegScore - LB - 1; if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) { OS << RelScore << ":v" << J << " "; } else { @@ -709,10 +684,10 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) { // Also need to print sgpr scores for lgkm_cnt. if (T == LGKM_CNT) { for (int J = 0; J <= getMaxSGPR(); J++) { - int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); + uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); if (RegScore <= LB) continue; - int RelScore = RegScore - LB - 1; + uint32_t RelScore = RegScore - LB - 1; OS << RelScore << ":s" << J << " "; } } @@ -722,23 +697,31 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) { OS << '\n'; } -unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T, - int ScoreToWait) { - unsigned int NeedWait = 0; - if (ScoreToWait == -1) { - // The score to wait is unknown. This implies that it was not encountered - // during the path of the CFG walk done during the current traversal but - // may be seen on a different path. Emit an s_wait counter with a - // conservative value of 0 for the counter. 
- NeedWait = CNT_MASK(T); - setScoreLB(T, getScoreUB(T)); - return NeedWait; - } +/// Simplify the waitcnt, in the sense of removing redundant counts, and return +/// whether a waitcnt instruction is needed at all. +bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { + return simplifyWaitcnt(VM_CNT, Wait.VmCnt) | + simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) | + simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt); +} + +bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T, + unsigned &Count) const { + const uint32_t LB = getScoreLB(T); + const uint32_t UB = getScoreUB(T); + if (Count < UB && UB - Count > LB) + return true; + Count = ~0u; + return false; +} + +void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait, + AMDGPU::Waitcnt &Wait) const { // If the score of src_operand falls within the bracket, we need an // s_waitcnt instruction. - const int32_t LB = getScoreLB(T); - const int32_t UB = getScoreUB(T); + const uint32_t LB = getScoreLB(T); + const uint32_t UB = getScoreUB(T); if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { if ((T == VM_CNT || T == LGKM_CNT) && hasPendingFlat() && @@ -746,90 +729,46 @@ unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T, // If there is a pending FLAT operation, and this is a VMem or LGKM // waitcnt and the target can report early completion, then we need // to force a waitcnt 0. - NeedWait = CNT_MASK(T); - setScoreLB(T, getScoreUB(T)); + addWait(Wait, T, 0); } else if (counterOutOfOrder(T)) { // Counter can get decremented out-of-order when there // are multiple types event in the bracket. Also emit an s_wait counter // with a conservative value of 0 for the counter. 
- NeedWait = CNT_MASK(T); - setScoreLB(T, getScoreUB(T)); + addWait(Wait, T, 0); } else { - NeedWait = CNT_MASK(T); - setScoreLB(T, ScoreToWait); + addWait(Wait, T, UB - ScoreToWait); } } +} - return NeedWait; +void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { + applyWaitcnt(VM_CNT, Wait.VmCnt); + applyWaitcnt(EXP_CNT, Wait.ExpCnt); + applyWaitcnt(LGKM_CNT, Wait.LgkmCnt); } -// Where there are multiple types of event in the bracket of a counter, -// the decrement may go out of order. -bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) { - switch (T) { - case VM_CNT: - return false; - case LGKM_CNT: { - if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] && - EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) { - // Scalar memory read always can go out of order. - return true; - } - int NumEventTypes = 0; - if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] && - EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) { - NumEventTypes++; - } - if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] && - EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) { - NumEventTypes++; - } - if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] && - EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) { - NumEventTypes++; - } - if (NumEventTypes <= 1) { - return false; - } - break; +void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { + const uint32_t UB = getScoreUB(T); + if (Count >= UB) + return; + if (Count != 0) { + if (counterOutOfOrder(T)) + return; + setScoreLB(T, std::max(getScoreLB(T), UB - Count)); + } else { + setScoreLB(T, UB); + MixedPendingEvents[T] = false; + PendingEvents &= ~WaitEventMaskForInst[T]; } - case EXP_CNT: { - // If there has been a mixture of export types, then a waitcnt exp(0) is - // required. 
- if (MixedExpTypes) - return true; - int NumEventTypes = 0; - if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] && - EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { - NumEventTypes++; - } - if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] && - EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { - NumEventTypes++; - } - if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] && - EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) { - NumEventTypes++; - } - if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] && - EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) { - NumEventTypes++; - } - - if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] && - EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) { - NumEventTypes++; - } +} - if (NumEventTypes <= 1) { - return false; - } - break; - } - default: - break; - } - return true; +// Where there are multiple types of event in the bracket of a counter, +// the decrement may go out of order. +bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { + // Scalar memory read always can go out of order. + if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS)) + return true; + return MixedPendingEvents[T]; } INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, @@ -851,29 +790,6 @@ static bool readsVCCZ(const MachineInstr &MI) { !MI.getOperand(1).isUndef(); } -/// Given wait count encodings checks if LHS is stronger than RHS. -bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) { - if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS)) - return false; - if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS)) - return false; - if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS)) - return false; - return true; -} - -/// Given wait count encodings create a new encoding which is stronger -/// or equal to both. 
-unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) { - unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS), - AMDGPU::decodeVmcnt(IV, RHS)); - unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS), - AMDGPU::decodeLgkmcnt(IV, RHS)); - unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS), - AMDGPU::decodeExpcnt(IV, RHS)); - return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt); -} - /// Generate s_waitcnt instruction to be placed before cur_Inst. /// Instructions of a given type are returned in order, /// but instructions of different types can complete out of order. @@ -884,51 +800,23 @@ unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) { /// and if so what the value of each counter is. /// The "score bracket" is bound by the lower bound and upper bound /// scores (*_score_LB and *_score_ub respectively). -void SIInsertWaitcnts::generateWaitcntInstBefore( - MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) { - // To emit, or not to emit - that's the question! - // Start with an assumption that there is no need to emit. - unsigned int EmitWaitcnt = 0; - - // No need to wait before phi. If a phi-move exists, then the wait should - // has been inserted before the move. If a phi-move does not exist, then - // wait should be inserted before the real use. The same is true for - // sc-merge. It is not a coincident that all these cases correspond to the - // instructions that are skipped in the assembling loop. - bool NeedLineMapping = false; // TODO: Check on this. - - // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug - bool ForceEmitZeroWaitcnt = false; - +bool SIInsertWaitcnts::generateWaitcntInstBefore( + MachineInstr &MI, WaitcntBrackets &ScoreBrackets, + MachineInstr *OldWaitcntInstr) { setForceEmitWaitcnt(); bool IsForceEmitWaitcnt = isForceEmitWaitcnt(); - if (MI.isDebugInstr() && - // TODO: any other opcode? 
- !NeedLineMapping) { - return; - } + if (MI.isDebugInstr()) + return false; - // See if an s_waitcnt is forced at block entry, or is needed at - // program end. - if (ScoreBrackets->getWaitAtBeginning()) { - // Note that we have already cleared the state, so we don't need to update - // it. - ScoreBrackets->clearWaitAtBeginning(); - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - EmitWaitcnt |= CNT_MASK(T); - ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); - } - } + AMDGPU::Waitcnt Wait; // See if this instruction has a forced S_WAITCNT VM. // TODO: Handle other cases of NeedsWaitcntVmBefore() - else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || - MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) { - EmitWaitcnt |= - ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); + if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || + MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || + MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) { + Wait.VmCnt = 0; } // All waits must be resolved at call return. @@ -936,23 +824,14 @@ void SIInsertWaitcnts::generateWaitcntInstBefore( // with knowledge of the called routines. if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || MI.getOpcode() == AMDGPU::S_SETPC_B64_return) { - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { - ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); - EmitWaitcnt |= CNT_MASK(T); - } - } + Wait = AMDGPU::Waitcnt::allZero(); } // Resolve vm waits before gs-done. 
else if ((MI.getOpcode() == AMDGPU::S_SENDMSG || MI.getOpcode() == AMDGPU::S_SENDMSGHALT) && ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) == AMDGPU::SendMsg::ID_GS_DONE)) { - if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) { - ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); - EmitWaitcnt |= CNT_MASK(VM_CNT); - } + Wait.VmCnt = 0; } #if 0 // TODO: the following blocks of logic when we have fence. else if (MI.getOpcode() == SC_FENCE) { @@ -1016,14 +895,12 @@ void SIInsertWaitcnts::generateWaitcntInstBefore( if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { // Export and GDS are tracked individually, either may trigger a waitcnt // for EXEC. - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK)); + if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) || + ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) || + ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) || + ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) { + Wait.ExpCnt = 0; + } } #if 0 // TODO: the following code to handle CALL. @@ -1051,27 +928,27 @@ void SIInsertWaitcnts::generateWaitcntInstBefore( // instruction. for (const MachineMemOperand *Memop : MI.memoperands()) { unsigned AS = Memop->getAddrSpace(); - if (AS != AMDGPUASI.LOCAL_ADDRESS) + if (AS != AMDGPUAS::LOCAL_ADDRESS) continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; // VM_CNT is only relevant to vgpr or LDS. 
- EmitWaitcnt |= ScoreBrackets->updateByWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); } for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { const MachineOperand &Op = MI.getOperand(I); const MachineRegisterInfo &MRIA = *MRI; RegInterval Interval = - ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false); + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false); for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (TRI->isVGPR(MRIA, Op.getReg())) { // VM_CNT is only relevant to vgpr or LDS. - EmitWaitcnt |= ScoreBrackets->updateByWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); } - EmitWaitcnt |= ScoreBrackets->updateByWait( - LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); + ScoreBrackets.determineWait( + LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); } } // End of for loop that looks at all source operands to decide vm_wait_cnt @@ -1086,29 +963,29 @@ void SIInsertWaitcnts::generateWaitcntInstBefore( // FIXME: Should not be relying on memoperands. 
for (const MachineMemOperand *Memop : MI.memoperands()) { unsigned AS = Memop->getAddrSpace(); - if (AS != AMDGPUASI.LOCAL_ADDRESS) + if (AS != AMDGPUAS::LOCAL_ADDRESS) continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; - EmitWaitcnt |= ScoreBrackets->updateByWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT)); + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.determineWait( + EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } } for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { MachineOperand &Def = MI.getOperand(I); const MachineRegisterInfo &MRIA = *MRI; RegInterval Interval = - ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true); + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true); for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (TRI->isVGPR(MRIA, Def.getReg())) { - EmitWaitcnt |= ScoreBrackets->updateByWait( - VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT)); + ScoreBrackets.determineWait( + VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); + ScoreBrackets.determineWait( + EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } - EmitWaitcnt |= ScoreBrackets->updateByWait( - LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT)); + ScoreBrackets.determineWait( + LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); } } // End of for loop that looks at all dest operands. } @@ -1119,182 +996,79 @@ void SIInsertWaitcnts::generateWaitcntInstBefore( // requiring a WAITCNT beforehand. 
if (MI.getOpcode() == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier()) { - EmitWaitcnt |= - ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitWaitcnt |= ScoreBrackets->updateByWait( - LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT)); + Wait = AMDGPU::Waitcnt::allZero(); } // TODO: Remove this work-around, enable the assert for Bug 457939 // after fixing the scheduler. Also, the Shader Compiler code is // independent of target. if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) { - if (ScoreBrackets->getScoreLB(LGKM_CNT) < - ScoreBrackets->getScoreUB(LGKM_CNT) && - ScoreBrackets->hasPendingSMEM()) { - // Wait on everything, not just LGKM. vccz reads usually come from - // terminators, and we always wait on everything at the end of the - // block, so if we only wait on LGKM here, we might end up with - // another s_waitcnt inserted right after this if there are non-LGKM - // instructions still outstanding. - // FIXME: this is too conservative / the comment is wrong. - // We don't wait on everything at the end of the block and we combine - // waitcnts so we should never have back-to-back waitcnts. - ForceEmitZeroWaitcnt = true; - EmitWaitcnt = true; + if (ScoreBrackets.getScoreLB(LGKM_CNT) < + ScoreBrackets.getScoreUB(LGKM_CNT) && + ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { + Wait.LgkmCnt = 0; } } - // Does this operand processing indicate s_wait counter update? - if (EmitWaitcnt || IsForceEmitWaitcnt) { - int CntVal[NUM_INST_CNTS]; - - bool UseDefaultWaitcntStrategy = true; - if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) { - // Force all waitcnts to 0. 
- for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); - } - CntVal[VM_CNT] = 0; - CntVal[EXP_CNT] = 0; - CntVal[LGKM_CNT] = 0; - UseDefaultWaitcntStrategy = false; - } - - if (UseDefaultWaitcntStrategy) { - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - if (EmitWaitcnt & CNT_MASK(T)) { - int Delta = - ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T); - int MaxDelta = ScoreBrackets->getWaitCountMax(T); - if (Delta >= MaxDelta) { - Delta = -1; - if (T != EXP_CNT) { - ScoreBrackets->setScoreLB( - T, ScoreBrackets->getScoreUB(T) - MaxDelta); - } - EmitWaitcnt &= ~CNT_MASK(T); - } - CntVal[T] = Delta; - } else { - // If we are not waiting for a particular counter then encode - // it as -1 which means "don't care." - CntVal[T] = -1; - } + // Early-out if no wait is indicated. + if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) { + bool Modified = false; + if (OldWaitcntInstr) { + if (TrackedWaitcntSet.count(OldWaitcntInstr)) { + TrackedWaitcntSet.erase(OldWaitcntInstr); + OldWaitcntInstr->eraseFromParent(); + Modified = true; + } else { + int64_t Imm = OldWaitcntInstr->getOperand(0).getImm(); + ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); } + Modified = true; } + return Modified; + } - // If we are not waiting on any counter we can skip the wait altogether. - if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) { - MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt(); - int Imm = (!OldWaitcnt) ? 
0 : OldWaitcnt->getOperand(0).getImm(); - if (!OldWaitcnt || - (AMDGPU::decodeVmcnt(IV, Imm) != - (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) || - (AMDGPU::decodeExpcnt(IV, Imm) != - (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) || - (AMDGPU::decodeLgkmcnt(IV, Imm) != - (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) { - MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent()); - if (ContainingLoop) { - MachineBasicBlock *TBB = ContainingLoop->getHeader(); - BlockWaitcntBrackets *ScoreBracket = - BlockWaitcntBracketsMap[TBB].get(); - if (!ScoreBracket) { - assert(!BlockVisitedSet.count(TBB)); - BlockWaitcntBracketsMap[TBB] = - llvm::make_unique<BlockWaitcntBrackets>(ST); - ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); - } - ScoreBracket->setRevisitLoop(true); - LLVM_DEBUG(dbgs() - << "set-revisit2: Block" - << ContainingLoop->getHeader()->getNumber() << '\n';); - } - } + if (ForceEmitZeroWaitcnts) + Wait = AMDGPU::Waitcnt::allZero(); - // Update an existing waitcount, or make a new one. - unsigned Enc = AMDGPU::encodeWaitcnt(IV, - ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT], - ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT], - ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]); - // We don't remove waitcnts that existed prior to the waitcnt - // pass. Check if the waitcnt to-be-inserted can be avoided - // or if the prev waitcnt can be updated. 
- bool insertSWaitInst = true; - for (MachineBasicBlock::iterator I = MI.getIterator(), - B = MI.getParent()->begin(); - insertSWaitInst && I != B; --I) { - if (I == MI.getIterator()) - continue; + if (ForceEmitWaitcnt[VM_CNT]) + Wait.VmCnt = 0; + if (ForceEmitWaitcnt[EXP_CNT]) + Wait.ExpCnt = 0; + if (ForceEmitWaitcnt[LGKM_CNT]) + Wait.LgkmCnt = 0; - switch (I->getOpcode()) { - case AMDGPU::S_WAITCNT: - if (isWaitcntStronger(I->getOperand(0).getImm(), Enc)) - insertSWaitInst = false; - else if (!OldWaitcnt) { - OldWaitcnt = &*I; - Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc); - } - break; - // TODO: skip over instructions which never require wait. - } - break; - } - if (insertSWaitInst) { - if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) { - if (ForceEmitZeroWaitcnts) - LLVM_DEBUG( - dbgs() - << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n"); - if (IsForceEmitWaitcnt) - LLVM_DEBUG(dbgs() - << "Force emit a s_waitcnt due to debug counter\n"); + ScoreBrackets.applyWaitcnt(Wait); - OldWaitcnt->getOperand(0).setImm(Enc); - if (!OldWaitcnt->getParent()) - MI.getParent()->insert(MI, OldWaitcnt); + AMDGPU::Waitcnt OldWait; + if (OldWaitcntInstr) { + OldWait = + AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm()); + } + if (OldWait.dominates(Wait)) + return false; - LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n" - << "Old Instr: " << MI << '\n' - << "New Instr: " << *OldWaitcnt << '\n'); - } else { - auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(), - MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(Enc); - TrackedWaitcntSet.insert(SWaitInst); + if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr)) + Wait = Wait.combined(OldWait); - LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n" - << "Old Instr: " << MI << '\n' - << "New Instr: " << *SWaitInst << '\n'); - } - } + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + if (OldWaitcntInstr) { + OldWaitcntInstr->getOperand(0).setImm(Enc); - if 
(CntVal[EXP_CNT] == 0) { - ScoreBrackets->setMixedExpTypes(false); - } - } - } -} + LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n" + << "Old Instr: " << MI << '\n' + << "New Instr: " << *OldWaitcntInstr << '\n'); + } else { + auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(), + MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(Enc); + TrackedWaitcntSet.insert(SWaitInst); -void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB, - MachineInstr *Waitcnt) { - if (MBB.empty()) { - MBB.push_back(Waitcnt); - return; + LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n" + << "Old Instr: " << MI << '\n' + << "New Instr: " << *SWaitInst << '\n'); } - MachineBasicBlock::iterator It = MBB.end(); - MachineInstr *MI = &*(--It); - if (MI->isBranch()) { - MBB.insert(It, Waitcnt); - } else { - MBB.push_back(Waitcnt); - } + return true; } // This is a flat memory operation. Check to see if it has memory @@ -1305,15 +1079,15 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { for (const MachineMemOperand *Memop : MI.memoperands()) { unsigned AS = Memop->getAddrSpace(); - if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) + if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) return true; } return false; } -void SIInsertWaitcnts::updateEventWaitcntAfter( - MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) { +void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, + WaitcntBrackets *ScoreBrackets) { // Now look at the instruction opcode. If it is a memory access // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. 
@@ -1379,342 +1153,124 @@ void SIInsertWaitcnts::updateEventWaitcntAfter( } } -// Merge the score brackets of the Block's predecessors; -// this merged score bracket is used when adding waitcnts to the Block -void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { - BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); - int32_t MaxPending[NUM_INST_CNTS] = {0}; - int32_t MaxFlat[NUM_INST_CNTS] = {0}; - bool MixedExpTypes = false; - - // For single basic block loops, we need to retain the Block's - // score bracket to have accurate Pred info. So, make a copy of Block's - // score bracket, clear() it (which retains several important bits of info), - // populate, and then replace en masse. For non-single basic block loops, - // just clear Block's current score bracket and repopulate in-place. - bool IsSelfPred; - std::unique_ptr<BlockWaitcntBrackets> S; - - IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block)) - != Block.pred_end(); - if (IsSelfPred) { - S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets); - ScoreBrackets = S.get(); - } - - ScoreBrackets->clear(); - - // See if there are any uninitialized predecessors. If so, emit an - // s_waitcnt 0 at the beginning of the block. 
- for (MachineBasicBlock *Pred : Block.predecessors()) { - BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[Pred].get(); - bool Visited = BlockVisitedSet.count(Pred); - if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { - continue; - } - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - int span = - PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T); - MaxPending[T] = std::max(MaxPending[T], span); - span = - PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T); - MaxFlat[T] = std::max(MaxFlat[T], span); - } - - MixedExpTypes |= PredScoreBrackets->mixedExpTypes(); - } - - // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()? - // Also handle kills for exit block. - if (Block.succ_empty() && !KillWaitBrackets.empty()) { - for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) { - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - int Span = KillWaitBrackets[I]->getScoreUB(T) - - KillWaitBrackets[I]->getScoreLB(T); - MaxPending[T] = std::max(MaxPending[T], Span); - Span = KillWaitBrackets[I]->pendingFlat(T) - - KillWaitBrackets[I]->getScoreLB(T); - MaxFlat[T] = std::max(MaxFlat[T], Span); - } - - MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes(); - } - } - - // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK. 
- for (MachineBasicBlock *Pred : Block.predecessors()) { - BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[Pred].get(); - bool Visited = BlockVisitedSet.count(Pred); - if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { - continue; - } - - int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) - - PredScoreBrackets->getScoreLB(EXP_CNT); - MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan); - int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) - - PredScoreBrackets->getScoreLB(EXP_CNT); - MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan); - } - - // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()? - if (Block.succ_empty() && !KillWaitBrackets.empty()) { - for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) { - int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) - - KillWaitBrackets[I]->getScoreLB(EXP_CNT); - MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan); - int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) - - KillWaitBrackets[I]->getScoreLB(EXP_CNT); - MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan); - } - } - -#if 0 - // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker. - // TODO: how does LC distinguish between function entry and main entry? - // If this is the entry to a function, force a wait. - MachineBasicBlock &Entry = Block.getParent()->front(); - if (Entry.getNumber() == Block.getNumber()) { - ScoreBrackets->setWaitAtBeginning(); - return; - } -#endif - - // Now set the current Block's brackets to the largest ending bracket. - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - ScoreBrackets->setScoreUB(T, MaxPending[T]); - ScoreBrackets->setScoreLB(T, 0); - ScoreBrackets->setLastFlat(T, MaxFlat[T]); - } - - ScoreBrackets->setMixedExpTypes(MixedExpTypes); - - // Set the register scoreboard. 
- for (MachineBasicBlock *Pred : Block.predecessors()) { - if (!BlockVisitedSet.count(Pred)) { - continue; - } +bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score, + uint32_t OtherScore) { + uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift; + uint32_t OtherShifted = + OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift; + Score = std::max(MyShifted, OtherShifted); + return OtherShifted > MyShifted; +} - BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[Pred].get(); +/// Merge the pending events and associater score brackets of \p Other into +/// this brackets status. +/// +/// Returns whether the merge resulted in a change that requires tighter waits +/// (i.e. the merged brackets strictly dominate the original brackets). +bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { + bool StrictDom = false; - // Now merge the gpr_reg_score information - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - int PredLB = PredScoreBrackets->getScoreLB(T); - int PredUB = PredScoreBrackets->getScoreUB(T); - if (PredLB < PredUB) { - int PredScale = MaxPending[T] - PredUB; - // Merge vgpr scores. - for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) { - int PredRegScore = PredScoreBrackets->getRegScore(J, T); - if (PredRegScore <= PredLB) - continue; - int NewRegScore = PredScale + PredRegScore; - ScoreBrackets->setRegScore( - J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore)); - } - // Also need to merge sgpr scores for lgkm_cnt. 
- if (T == LGKM_CNT) { - for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) { - int PredRegScore = - PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); - if (PredRegScore <= PredLB) - continue; - int NewRegScore = PredScale + PredRegScore; - ScoreBrackets->setRegScore( - J + NUM_ALL_VGPRS, LGKM_CNT, - std::max( - ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT), - NewRegScore)); - } - } - } - } + for (auto T : inst_counter_types()) { + // Merge event flags for this counter + const bool OldOutOfOrder = counterOutOfOrder(T); + const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T]; + const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; + if (OtherEvents & ~OldEvents) + StrictDom = true; + if (Other.MixedPendingEvents[T] || + (OldEvents && OtherEvents && OldEvents != OtherEvents)) + MixedPendingEvents[T] = true; + PendingEvents |= OtherEvents; - // Also merge the WaitEvent information. - ForAllWaitEventType(W) { - enum InstCounterType T = PredScoreBrackets->eventCounter(W); - int PredEventUB = PredScoreBrackets->getEventUB(W); - if (PredEventUB > PredScoreBrackets->getScoreLB(T)) { - int NewEventUB = - MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T); - if (NewEventUB > 0) { - ScoreBrackets->setEventUB( - W, std::max(ScoreBrackets->getEventUB(W), NewEventUB)); - } - } - } - } + // Merge scores for this counter + const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T]; + const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T]; + MergeInfo M; + M.OldLB = ScoreLBs[T]; + M.OtherLB = Other.ScoreLBs[T]; + M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0; + M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift; - // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()? - // Set the register scoreboard. 
- if (Block.succ_empty() && !KillWaitBrackets.empty()) { - for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) { - // Now merge the gpr_reg_score information. - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - int PredLB = KillWaitBrackets[I]->getScoreLB(T); - int PredUB = KillWaitBrackets[I]->getScoreUB(T); - if (PredLB < PredUB) { - int PredScale = MaxPending[T] - PredUB; - // Merge vgpr scores. - for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) { - int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T); - if (PredRegScore <= PredLB) - continue; - int NewRegScore = PredScale + PredRegScore; - ScoreBrackets->setRegScore( - J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore)); - } - // Also need to merge sgpr scores for lgkm_cnt. - if (T == LGKM_CNT) { - for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) { - int PredRegScore = - KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT); - if (PredRegScore <= PredLB) - continue; - int NewRegScore = PredScale + PredRegScore; - ScoreBrackets->setRegScore( - J + NUM_ALL_VGPRS, LGKM_CNT, - std::max( - ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT), - NewRegScore)); - } - } - } - } + const uint32_t NewUB = ScoreUBs[T] + M.MyShift; + if (NewUB < ScoreUBs[T]) + report_fatal_error("waitcnt score overflow"); + ScoreUBs[T] = NewUB; + ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift); - // Also merge the WaitEvent information. 
- ForAllWaitEventType(W) { - enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W); - int PredEventUB = KillWaitBrackets[I]->getEventUB(W); - if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) { - int NewEventUB = - MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T); - if (NewEventUB > 0) { - ScoreBrackets->setEventUB( - W, std::max(ScoreBrackets->getEventUB(W), NewEventUB)); - } - } - } - } - } + StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]); - // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the - // sequencing predecessors, because changes to EXEC require waitcnts due to - // the delayed nature of these operations. - for (MachineBasicBlock *Pred : Block.predecessors()) { - if (!BlockVisitedSet.count(Pred)) { - continue; + bool RegStrictDom = false; + for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E; + J++) { + RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]); } - BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[Pred].get(); - - int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK); - if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) { - int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub - - PredScoreBrackets->getScoreUB(EXP_CNT); - if (new_gds_ub > 0) { - ScoreBrackets->setEventUB( - GDS_GPR_LOCK, - std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub)); - } - } - int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK); - if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) { - int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub - - PredScoreBrackets->getScoreUB(EXP_CNT); - if (new_exp_ub > 0) { - ScoreBrackets->setEventUB( - EXP_GPR_LOCK, - std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub)); + if (T == LGKM_CNT) { + for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1; + J != E; J++) { + RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]); } } - } - // if a 
single block loop, update the score brackets. Not needed for other - // blocks, as we did this in-place - if (IsSelfPred) { - BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets); + if (RegStrictDom && !OldOutOfOrder) + StrictDom = true; } -} -/// Return true if the given basic block is a "bottom" block of a loop. -/// This works even if the loop is discontiguous. This also handles -/// multiple back-edges for the same "header" block of a loop. -bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop, - const MachineBasicBlock *Block) { - for (MachineBasicBlock *MBB : Loop->blocks()) { - if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) { - return true; - } - } - return false; -} + VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR()); + SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR()); -/// Count the number of "bottom" basic blocks of a loop. -unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) { - unsigned Count = 0; - for (MachineBasicBlock *MBB : Loop->blocks()) { - if (MBB->isSuccessor(Loop->getHeader())) { - Count++; - } - } - return Count; + return StrictDom; } // Generate s_waitcnt instructions where needed. -void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, - MachineBasicBlock &Block) { - // Initialize the state information. - mergeInputScoreBrackets(Block); - - BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); +bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, + MachineBasicBlock &Block, + WaitcntBrackets &ScoreBrackets) { + bool Modified = false; LLVM_DEBUG({ dbgs() << "*** Block" << Block.getNumber() << " ***"; - ScoreBrackets->dump(); + ScoreBrackets.dump(); }); // Walk over the instructions. + MachineInstr *OldWaitcntInstr = nullptr; + for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end(); Iter != E;) { MachineInstr &Inst = *Iter; + // Remove any previously existing waitcnts. 
if (Inst.getOpcode() == AMDGPU::S_WAITCNT) { - // Leave pre-existing waitcnts, but note their existence via setWaitcnt. - // Remove the waitcnt-pass-generated waitcnts; the pass will add them back - // as needed. - if (!TrackedWaitcntSet.count(&Inst)) - ++Iter; - else { - ++Iter; - Inst.removeFromParent(); + if (OldWaitcntInstr) { + if (TrackedWaitcntSet.count(OldWaitcntInstr)) { + TrackedWaitcntSet.erase(OldWaitcntInstr); + OldWaitcntInstr->eraseFromParent(); + OldWaitcntInstr = nullptr; + } else if (!TrackedWaitcntSet.count(&Inst)) { + // Two successive s_waitcnt's, both of which are pre-existing and + // are therefore preserved. + int64_t Imm = OldWaitcntInstr->getOperand(0).getImm(); + ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm)); + } else { + ++Iter; + Inst.eraseFromParent(); + Modified = true; + continue; + } } - ScoreBrackets->setWaitcnt(&Inst); - continue; - } - // Kill instructions generate a conditional branch to the endmain block. - // Merge the current waitcnt state into the endmain block information. - // TODO: Are there other flavors of KILL instruction? - if (Inst.getOpcode() == AMDGPU::KILL) { - addKillWaitBracket(ScoreBrackets); + OldWaitcntInstr = &Inst; + ++Iter; + continue; } bool VCCZBugWorkAround = false; if (readsVCCZ(Inst) && (!VCCZBugHandledSet.count(&Inst))) { - if (ScoreBrackets->getScoreLB(LGKM_CNT) < - ScoreBrackets->getScoreUB(LGKM_CNT) && - ScoreBrackets->hasPendingSMEM()) { + if (ScoreBrackets.getScoreLB(LGKM_CNT) < + ScoreBrackets.getScoreUB(LGKM_CNT) && + ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) { if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) VCCZBugWorkAround = true; } @@ -1722,9 +1278,10 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // Generate an s_waitcnt instruction to be placed before // cur_Inst, if needed. 
- generateWaitcntInstBefore(Inst, ScoreBrackets); + Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr); + OldWaitcntInstr = nullptr; - updateEventWaitcntAfter(Inst, ScoreBrackets); + updateEventWaitcntAfter(Inst, &ScoreBrackets); #if 0 // TODO: implement resource type check controlled by options with ub = LB. // If this instruction generates a S_SETVSKIP because it is an @@ -1737,11 +1294,9 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } #endif - ScoreBrackets->clearWaitcnt(); - LLVM_DEBUG({ Inst.print(dbgs()); - ScoreBrackets->dump(); + ScoreBrackets.dump(); }); // Check to see if this is a GWS instruction. If so, and if this is CI or @@ -1753,10 +1308,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P || Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) { // TODO: && context->target_info->GwsRequiresMemViolTest() ) { - ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); - ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - ScoreBrackets->updateByWait(LGKM_CNT, - ScoreBrackets->getScoreUB(LGKM_CNT)); + ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZero()); } // TODO: Remove this work-around after fixing the scheduler and enable the @@ -1769,71 +1321,13 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, AMDGPU::VCC) .addReg(AMDGPU::VCC); VCCZBugHandledSet.insert(&Inst); + Modified = true; } ++Iter; } - // Check if we need to force convergence at loop footer. - MachineLoop *ContainingLoop = MLI->getLoopFor(&Block); - if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) { - LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); - WaitcntData->print(); - LLVM_DEBUG(dbgs() << '\n';); - - // The iterative waitcnt insertion algorithm aims for optimal waitcnt - // placement, but doesn't guarantee convergence for a loop. 
Each - // loop should take at most (n+1) iterations for it to converge naturally, - // where n is the number of bottom blocks. If this threshold is reached and - // the result hasn't converged, then we force convergence by inserting - // a s_waitcnt at the end of loop footer. - if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) { - // To ensure convergence, need to make wait events at loop footer be no - // more than those from the previous iteration. - // As a simplification, instead of tracking individual scores and - // generating the precise wait count, just wait on 0. - bool HasPending = false; - MachineInstr *SWaitInst = WaitcntData->getWaitcnt(); - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { - ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); - HasPending = true; - break; - } - } - - if (HasPending) { - if (!SWaitInst) { - SWaitInst = BuildMI(Block, Block.getFirstNonPHI(), - DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); - TrackedWaitcntSet.insert(SWaitInst); -#if 0 // TODO: Format the debug output - OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context); - OutputTransformAdd(SWaitInst, context); -#endif - } -#if 0 // TODO: ?? - _DEV( REPORTED_STATS->force_waitcnt_converge = 1; ) -#endif - } - - if (SWaitInst) { - LLVM_DEBUG({ - SWaitInst->print(dbgs()); - dbgs() << "\nAdjusted score board:"; - ScoreBrackets->dump(); - }); - - // Add this waitcnt to the block. It is either newly created or - // created in previous iterations and added back since block traversal - // always removes waitcnts. 
- insertWaitcntBeforeCF(Block, SWaitInst); - WaitcntData->setWaitcnt(SWaitInst); - } - } - } + return Modified; } bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { @@ -1841,14 +1335,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); - MLI = &getAnalysis<MachineLoopInfo>(); - IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits()); + IV = AMDGPU::getIsaVersion(ST->getCPU()); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - AMDGPUASI = ST->getAMDGPUAS(); ForceEmitZeroWaitcnts = ForceEmitZeroFlag; - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) + for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); @@ -1868,93 +1359,70 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1; TrackedWaitcntSet.clear(); - BlockVisitedSet.clear(); VCCZBugHandledSet.clear(); - LoopWaitcntDataMap.clear(); - BlockWaitcntProcessedSet.clear(); + RpotIdxMap.clear(); + BlockInfos.clear(); - // Walk over the blocks in reverse post-dominator order, inserting - // s_waitcnt where needed. - ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); - bool Modified = false; - for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator - I = RPOT.begin(), - E = RPOT.end(), J = RPOT.begin(); - I != E;) { - MachineBasicBlock &MBB = **I; + // Keep iterating over the blocks in reverse post order, inserting and + // updating s_waitcnt where needed, until a fix point is reached. 
+ for (MachineBasicBlock *MBB : + ReversePostOrderTraversal<MachineFunction *>(&MF)) { + RpotIdxMap[MBB] = BlockInfos.size(); + BlockInfos.emplace_back(MBB); + } - BlockVisitedSet.insert(&MBB); + std::unique_ptr<WaitcntBrackets> Brackets; + bool Modified = false; + bool Repeat; + do { + Repeat = false; - BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); - if (!ScoreBrackets) { - BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST); - ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); - } - ScoreBrackets->setPostOrder(MBB.getNumber()); - MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB); - if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr) - LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>(); + for (BlockInfo &BI : BlockInfos) { + if (!BI.Dirty) + continue; - // If we are walking into the block from before the loop, then guarantee - // at least 1 re-walk over the loop to propagate the information, even if - // no S_WAITCNT instructions were generated. - if (ContainingLoop && ContainingLoop->getHeader() == &MBB) { - unsigned Count = countNumBottomBlocks(ContainingLoop); + unsigned Idx = std::distance(&*BlockInfos.begin(), &BI); - // If the loop has multiple back-edges, and so more than one "bottom" - // basic block, we have to guarantee a re-walk over every blocks. - if ((std::count(BlockWaitcntProcessedSet.begin(), - BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) { - BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true); - LLVM_DEBUG(dbgs() << "set-revisit1: Block" - << ContainingLoop->getHeader()->getNumber() << '\n';); + if (BI.Incoming) { + if (!Brackets) + Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming); + else + *Brackets = *BI.Incoming; + } else { + if (!Brackets) + Brackets = llvm::make_unique<WaitcntBrackets>(ST); + else + Brackets->clear(); } - } - // Walk over the instructions. 
- insertWaitcntInBlock(MF, MBB); + Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets); + BI.Dirty = false; - // Record that waitcnts have been processed at least once for this block. - BlockWaitcntProcessedSet.push_back(&MBB); - - // See if we want to revisit the loop. If a loop has multiple back-edges, - // we shouldn't revisit the same "bottom" basic block. - if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) && - std::count(BlockWaitcntProcessedSet.begin(), - BlockWaitcntProcessedSet.end(), &MBB) == 1) { - MachineBasicBlock *EntryBB = ContainingLoop->getHeader(); - BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get(); - if (EntrySB && EntrySB->getRevisitLoop()) { - EntrySB->setRevisitLoop(false); - J = I; - int32_t PostOrder = EntrySB->getPostOrder(); - // TODO: Avoid this loop. Find another way to set I. - for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator - X = RPOT.begin(), - Y = RPOT.end(); - X != Y; ++X) { - MachineBasicBlock &MBBX = **X; - if (MBBX.getNumber() == PostOrder) { - I = X; - break; + if (Brackets->hasPending()) { + BlockInfo *MoveBracketsToSucc = nullptr; + for (MachineBasicBlock *Succ : BI.MBB->successors()) { + unsigned SuccIdx = RpotIdxMap[Succ]; + BlockInfo &SuccBI = BlockInfos[SuccIdx]; + if (!SuccBI.Incoming) { + SuccBI.Dirty = true; + if (SuccIdx <= Idx) + Repeat = true; + if (!MoveBracketsToSucc) { + MoveBracketsToSucc = &SuccBI; + } else { + SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets); + } + } else if (SuccBI.Incoming->merge(*Brackets)) { + SuccBI.Dirty = true; + if (SuccIdx <= Idx) + Repeat = true; } } - LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); - WaitcntData->incIterCnt(); - LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';); - continue; - } else { - LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); - // Loop converged, reset iteration count. 
If this loop gets revisited, - // it must be from an outer loop, the counter will restart, this will - // ensure we don't force convergence on such revisits. - WaitcntData->resetIterCnt(); + if (MoveBracketsToSucc) + MoveBracketsToSucc->Incoming = std::move(Brackets); } } - - J = I; - ++I; - } + } while (Repeat); SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td index b73d30940fc38..65ffc27b8b608 100644 --- a/lib/Target/AMDGPU/SIInstrFormats.td +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -121,6 +121,10 @@ class InstSI <dag outs, dag ins, string asm = "", // This bit indicates that this is a D16 buffer instruction. field bit D16Buf = 0; + // This bit indicates that this uses the floating point double precision + // rounding mode flags + field bit FPDPRounding = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -178,6 +182,8 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{50} = D16Buf; + let TSFlags{51} = FPDPRounding; + let SchedRW = [Write32Bit]; field bits<1> DisableSIDecoder = 0; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index f3745382a6f4b..2370d5fa7b27b 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -31,6 +31,7 @@ #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" @@ -264,9 +265,10 @@ static bool isStride64(unsigned Opc) { } } -bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, - int64_t &Offset, - const TargetRegisterInfo *TRI) const { +bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt, + MachineOperand *&BaseOp, + int64_t 
&Offset, + const TargetRegisterInfo *TRI) const { unsigned Opc = LdSt.getOpcode(); if (isDS(LdSt)) { @@ -274,11 +276,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, getNamedOperand(LdSt, AMDGPU::OpName::offset); if (OffsetImm) { // Normal, single offset LDS instruction. - const MachineOperand *AddrReg = - getNamedOperand(LdSt, AMDGPU::OpName::addr); - - BaseReg = AddrReg->getReg(); + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); Offset = OffsetImm->getImm(); + assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " + "operands of type register."); return true; } @@ -309,10 +310,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, if (isStride64(Opc)) EltSize *= 64; - const MachineOperand *AddrReg = - getNamedOperand(LdSt, AMDGPU::OpName::addr); - BaseReg = AddrReg->getReg(); + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); Offset = EltSize * Offset0; + assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " + "operands of type register."); return true; } @@ -324,19 +325,20 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, if (SOffset && SOffset->isReg()) return false; - const MachineOperand *AddrReg = - getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); if (!AddrReg) return false; const MachineOperand *OffsetImm = getNamedOperand(LdSt, AMDGPU::OpName::offset); - BaseReg = AddrReg->getReg(); + BaseOp = AddrReg; Offset = OffsetImm->getImm(); if (SOffset) // soffset can be an inline immediate. 
Offset += SOffset->getImm(); + assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " + "operands of type register."); return true; } @@ -346,36 +348,46 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, if (!OffsetImm) return false; - const MachineOperand *SBaseReg = - getNamedOperand(LdSt, AMDGPU::OpName::sbase); - BaseReg = SBaseReg->getReg(); + MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase); + BaseOp = SBaseReg; Offset = OffsetImm->getImm(); + assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " + "operands of type register."); return true; } if (isFLAT(LdSt)) { - const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); if (VAddr) { // Can't analyze 2 offsets. if (getNamedOperand(LdSt, AMDGPU::OpName::saddr)) return false; - BaseReg = VAddr->getReg(); + BaseOp = VAddr; } else { // scratch instructions have either vaddr or saddr. - BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg(); + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr); } Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); + assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base " + "operands of type register."); return true; } return false; } -static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1, - const MachineInstr &MI2, unsigned BaseReg2) { - if (BaseReg1 == BaseReg2) +static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, + const MachineOperand &BaseOp1, + const MachineInstr &MI2, + const MachineOperand &BaseOp2) { + // Support only base operands with base registers. + // Note: this could be extended to support FI operands. 
+ if (!BaseOp1.isReg() || !BaseOp2.isReg()) + return false; + + if (BaseOp1.isIdenticalTo(BaseOp2)) return true; if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) @@ -401,12 +413,13 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1, return Base1 == Base2; } -bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, - unsigned BaseReg1, - MachineInstr &SecondLdSt, - unsigned BaseReg2, +bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1, + MachineOperand &BaseOp2, unsigned NumLoads) const { - if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2)) + MachineInstr &FirstLdSt = *BaseOp1.getParent(); + MachineInstr &SecondLdSt = *BaseOp2.getParent(); + + if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2)) return false; const MachineOperand *FirstDst = nullptr; @@ -863,7 +876,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - DebugLoc DL = MBB.findDebugLoc(MI); + const DebugLoc &DL = MBB.findDebugLoc(MI); unsigned Size = FrameInfo.getObjectSize(FrameIndex); unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); @@ -907,16 +920,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, return; } - if (!ST.isVGPRSpillingEnabled(MF->getFunction())) { - LLVMContext &Ctx = MF->getFunction().getContext(); - Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" - " spill register"); - BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) - .addReg(SrcReg); - - return; - } - assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize); @@ -972,9 +975,9 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); - const SIMachineFunctionInfo *MFI = 
MF->getInfo<SIMachineFunctionInfo>(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); - DebugLoc DL = MBB.findDebugLoc(MI); + const DebugLoc &DL = MBB.findDebugLoc(MI); unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); unsigned Size = FrameInfo.getObjectSize(FrameIndex); unsigned SpillSize = TRI->getSpillSize(*RC); @@ -986,6 +989,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, PtrInfo, MachineMemOperand::MOLoad, Size, Align); if (RI.isSGPRClass(RC)) { + MFI->setHasSpilledSGPRs(); + // FIXME: Maybe this should not include a memoperand because it will be // lowered to non-memory instructions. const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); @@ -1009,15 +1014,6 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, return; } - if (!ST.isVGPRSpillingEnabled(MF->getFunction())) { - LLVMContext &Ctx = MF->getFunction().getContext(); - Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" - " restore register"); - BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); - - return; - } - assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize); @@ -1036,7 +1032,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); - DebugLoc DL = MBB.findDebugLoc(MI); + const DebugLoc &DL = MBB.findDebugLoc(MI); unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); unsigned WavefrontSize = ST.getWavefrontSize(); @@ -1044,7 +1040,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( if (!MFI->hasCalculatedTID()) { MachineBasicBlock &Entry = MBB.getParent()->front(); MachineBasicBlock::iterator Insert = Entry.front(); - DebugLoc DL = Insert->getDebugLoc(); + const DebugLoc &DL = Insert->getDebugLoc(); TIDReg = 
RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass, *MF); @@ -1421,10 +1417,15 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, // TargetInstrInfo::commuteInstruction uses it. bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const { - if (!MI.isCommutable()) + return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); +} + +bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0, + unsigned &SrcOpIdx1) const { + if (!Desc.isCommutable()) return false; - unsigned Opc = MI.getOpcode(); + unsigned Opc = Desc.getOpcode(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) return false; @@ -1549,8 +1550,9 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, // buzz; RS->enterBasicBlockEnd(MBB); - unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass, - MachineBasicBlock::iterator(GetPC), 0); + unsigned Scav = RS->scavengeRegisterBackwards( + AMDGPU::SReg_64RegClass, + MachineBasicBlock::iterator(GetPC), false, 0); MRI.replaceRegWith(PCReg, Scav); MRI.clearVirtRegs(); RS->setRegUsed(Scav); @@ -1644,7 +1646,34 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const { MachineBasicBlock::iterator I = MBB.getFirstTerminator(); - if (I == MBB.end()) + auto E = MBB.end(); + if (I == E) + return false; + + // Skip over the instructions that are artificially terminators for special + // exec management. 
+ while (I != E && !I->isBranch() && !I->isReturn() && + I->getOpcode() != AMDGPU::SI_MASK_BRANCH) { + switch (I->getOpcode()) { + case AMDGPU::SI_MASK_BRANCH: + case AMDGPU::S_MOV_B64_term: + case AMDGPU::S_XOR_B64_term: + case AMDGPU::S_ANDN2_B64_term: + break; + case AMDGPU::SI_IF: + case AMDGPU::SI_ELSE: + case AMDGPU::SI_KILL_I1_TERMINATOR: + case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: + // FIXME: It's messy that these need to be considered here at all. + return true; + default: + llvm_unreachable("unexpected non-branch terminator inst"); + } + + ++I; + } + + if (I == E) return false; if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH) @@ -1933,20 +1962,20 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { } unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( - PseudoSourceValue::PSVKind Kind) const { + unsigned Kind) const { switch(Kind) { case PseudoSourceValue::Stack: case PseudoSourceValue::FixedStack: - return ST.getAMDGPUAS().PRIVATE_ADDRESS; + return AMDGPUAS::PRIVATE_ADDRESS; case PseudoSourceValue::ConstantPool: case PseudoSourceValue::GOT: case PseudoSourceValue::JumpTable: case PseudoSourceValue::GlobalValueCallEntry: case PseudoSourceValue::ExternalSymbolCallEntry: case PseudoSourceValue::TargetCustom: - return ST.getAMDGPUAS().CONSTANT_ADDRESS; + return AMDGPUAS::CONSTANT_ADDRESS; } - return ST.getAMDGPUAS().FLAT_ADDRESS; + return AMDGPUAS::FLAT_ADDRESS; } static void removeModOperands(MachineInstr &MI) { @@ -2066,12 +2095,40 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (Src2->isReg() && Src2->getReg() == Reg) { // Not allowed to use constant bus for another operand. // We can however allow an inline immediate as src0. - if (!Src0->isImm() && - (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) - return false; + bool Src0Inlined = false; + if (Src0->isReg()) { + // Try to inline constant if possible. 
+ // If the Def moves immediate and the use is single + // We are saving VGPR here. + MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); + if (Def && Def->isMoveImmediate() && + isInlineConstant(Def->getOperand(1)) && + MRI->hasOneUse(Src0->getReg())) { + Src0->ChangeToImmediate(Def->getOperand(1).getImm()); + Src0Inlined = true; + } else if ((RI.isPhysicalRegister(Src0->getReg()) && + RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) || + (RI.isVirtualRegister(Src0->getReg()) && + RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) + return false; + // VGPR is okay as Src0 - fallthrough + } - if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) - return false; + if (Src1->isReg() && !Src0Inlined ) { + // We have one slot for inlinable constant so far - try to fill it + MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); + if (Def && Def->isMoveImmediate() && + isInlineConstant(Def->getOperand(1)) && + MRI->hasOneUse(Src1->getReg()) && + commuteInstruction(UseMI)) { + Src0->ChangeToImmediate(Def->getOperand(1).getImm()); + } else if ((RI.isPhysicalRegister(Src1->getReg()) && + RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) || + (RI.isVirtualRegister(Src1->getReg()) && + RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + return false; + // VGPR is okay as Src1 - fallthrough + } const int64_t Imm = ImmOp->getImm(); @@ -2117,11 +2174,13 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA, bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, MachineInstr &MIb) const { - unsigned BaseReg0, BaseReg1; + MachineOperand *BaseOp0, *BaseOp1; int64_t Offset0, Offset1; - if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && - getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { + if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) && + getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) { + if (!BaseOp0->isIdenticalTo(*BaseOp1)) + return false; if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { // 
FIXME: Handle ds_read2 / ds_write2. @@ -2129,8 +2188,7 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, } unsigned Width0 = (*MIa.memoperands_begin())->getSize(); unsigned Width1 = (*MIb.memoperands_begin())->getSize(); - if (BaseReg0 == BaseReg1 && - offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { + if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { return true; } } @@ -2398,8 +2456,7 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_INLINE_C_INT32: case AMDGPU::OPERAND_REG_INLINE_C_FP32: { int32_t Trunc = static_cast<int32_t>(Imm); - return Trunc == Imm && - AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); + return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); } case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_IMM_FP64: @@ -2523,6 +2580,115 @@ bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const { hasModifiersSet(MI, AMDGPU::OpName::omod); } +bool SIInstrInfo::canShrink(const MachineInstr &MI, + const MachineRegisterInfo &MRI) const { + const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); + // Can't shrink instruction with three operands. + // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add + // a special case for it. It can only be shrunk if the third operand + // is vcc. We should handle this the same way we handle vopc, by addding + // a register allocation hint pre-regalloc and then do the shrinking + // post-regalloc. + if (Src2) { + switch (MI.getOpcode()) { + default: return false; + + case AMDGPU::V_ADDC_U32_e64: + case AMDGPU::V_SUBB_U32_e64: + case AMDGPU::V_SUBBREV_U32_e64: { + const MachineOperand *Src1 + = getNamedOperand(MI, AMDGPU::OpName::src1); + if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg())) + return false; + // Additional verification is needed for sdst/src2. 
+ return true; + } + case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_FMAC_F32_e64: + if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) || + hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) + return false; + break; + + case AMDGPU::V_CNDMASK_B32_e64: + break; + } + } + + const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); + if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) || + hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) + return false; + + // We don't need to check src0, all input types are legal, so just make sure + // src0 isn't using any modifiers. + if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) + return false; + + // Can it be shrunk to a valid 32 bit opcode? + if (!hasVALU32BitEncoding(MI.getOpcode())) + return false; + + // Check output modifiers + return !hasModifiersSet(MI, AMDGPU::OpName::omod) && + !hasModifiersSet(MI, AMDGPU::OpName::clamp); +} + +// Set VCC operand with all flags from \p Orig, except for setting it as +// implicit. +static void copyFlagsToImplicitVCC(MachineInstr &MI, + const MachineOperand &Orig) { + + for (MachineOperand &Use : MI.implicit_operands()) { + if (Use.isUse() && Use.getReg() == AMDGPU::VCC) { + Use.setIsUndef(Orig.isUndef()); + Use.setIsKill(Orig.isKill()); + return; + } + } +} + +MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, + unsigned Op32) const { + MachineBasicBlock *MBB = MI.getParent();; + MachineInstrBuilder Inst32 = + BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)); + + // Add the dst operand if the 32-bit encoding also has an explicit $vdst. + // For VOPC instructions, this is replaced by an implicit def of vcc. 
+ int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); + if (Op32DstIdx != -1) { + // dst + Inst32.add(MI.getOperand(0)); + } else { + assert(MI.getOperand(0).getReg() == AMDGPU::VCC && + "Unexpected case"); + } + + Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0)); + + const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); + if (Src1) + Inst32.add(*Src1); + + const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); + + if (Src2) { + int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); + if (Op32Src2Idx != -1) { + Inst32.add(*Src2); + } else { + // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is + // replaced with an implicit read of vcc. This was already added + // during the initial BuildMI, so find it to preserve the flags. + copyFlagsToImplicitVCC(*Inst32, *Src2); + } + } + + return Inst32; +} + bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const { @@ -2806,6 +2972,42 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + // Verify MIMG + if (isMIMG(MI.getOpcode()) && !MI.mayStore()) { + // Ensure that the return type used is large enough for all the options + // being used TFE/LWE require an extra result register. + const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask); + if (DMask) { + uint64_t DMaskImm = DMask->getImm(); + uint32_t RegCount = + isGather4(MI.getOpcode()) ? 
4 : countPopulation(DMaskImm); + const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe); + const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe); + const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16); + + // Adjust for packed 16 bit values + if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem()) + RegCount >>= 1; + + // Adjust if using LWE or TFE + if ((LWE && LWE->getImm()) || (TFE && TFE->getImm())) + RegCount += 1; + + const uint32_t DstIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); + const MachineOperand &Dst = MI.getOperand(DstIdx); + if (Dst.isReg()) { + const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx); + uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32; + if (RegCount > DstSize) { + ErrInfo = "MIMG instruction returns too many registers for dst " + "register class"; + return false; + } + } + } + } + // Verify VOP*. Ignore multiple sgpr operands on writelane. if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) { @@ -3001,6 +3203,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64; case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64; + case AMDGPU::S_XNOR_B32: + return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64; case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64; case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64; @@ -3438,8 +3642,13 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, // pointer value is uniform. 
MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { - unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); - SBase->setReg(SGPR); + unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); + SBase->setReg(SGPR); + } + MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff); + if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) { + unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI); + SOff->setReg(SGPR); } } @@ -3475,7 +3684,191 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, FoldImmediate(*Copy, *Def, OpReg, &MRI); } -void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { +// Emit the actual waterfall loop, executing the wrapped instruction for each +// unique value of \p Rsrc across all lanes. In the best case we execute 1 +// iteration, in the worst case we execute 64 (once per lane). +static void +emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, + MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, + const DebugLoc &DL, MachineOperand &Rsrc) { + MachineBasicBlock::iterator I = LoopBB.begin(); + + unsigned VRsrc = Rsrc.getReg(); + unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef()); + + unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + + // Beginning of the loop, 
read the next Rsrc variant. + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0) + .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0); + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1) + .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1); + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2) + .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2); + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3) + .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3); + + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc) + .addReg(SRsrcSub0) + .addImm(AMDGPU::sub0) + .addReg(SRsrcSub1) + .addImm(AMDGPU::sub1) + .addReg(SRsrcSub2) + .addImm(AMDGPU::sub2) + .addReg(SRsrcSub3) + .addImm(AMDGPU::sub3); + + // Update Rsrc operand to use the SGPR Rsrc. + Rsrc.setReg(SRsrc); + Rsrc.setIsKill(true); + + // Identify all lanes with identical Rsrc operands in their VGPRs. + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0) + .addReg(SRsrc, 0, AMDGPU::sub0_sub1) + .addReg(VRsrc, 0, AMDGPU::sub0_sub1); + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1) + .addReg(SRsrc, 0, AMDGPU::sub2_sub3) + .addReg(VRsrc, 0, AMDGPU::sub2_sub3); + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond) + .addReg(CondReg0) + .addReg(CondReg1); + + MRI.setSimpleHint(SaveExec, AndCond); + + // Update EXEC to matching lanes, saving original to SaveExec. + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec) + .addReg(AndCond, RegState::Kill); + + // The original instruction is here; we insert the terminators after it. + I = LoopBB.end(); + + // Update EXEC, switch all done bits to 0 and all todo bits to 1. 
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(SaveExec); + BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB); +} + +// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register +// with SGPRs by iterating over all unique values across all lanes. +static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, + MachineOperand &Rsrc, MachineDominatorTree *MDT) { + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineBasicBlock::iterator I(&MI); + const DebugLoc &DL = MI.getDebugLoc(); + + unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + + // Save the EXEC mask + BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec) + .addReg(AMDGPU::EXEC); + + // Killed uses in the instruction we are waterfalling around will be + // incorrect due to the added control-flow. + for (auto &MO : MI.uses()) { + if (MO.isReg() && MO.isUse()) { + MRI.clearKillFlags(MO.getReg()); + } + } + + // To insert the loop we need to split the block. Move everything after this + // point to a new block, and insert a new empty block between the two. + MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); + MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + + MF.insert(MBBI, LoopBB); + MF.insert(MBBI, RemainderBB); + + LoopBB->addSuccessor(LoopBB); + LoopBB->addSuccessor(RemainderBB); + + // Move MI to the LoopBB, and the remainder of the block to RemainderBB. + MachineBasicBlock::iterator J = I++; + RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); + RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); + LoopBB->splice(LoopBB->begin(), &MBB, J); + + MBB.addSuccessor(LoopBB); + + // Update dominators. 
We know that MBB immediately dominates LoopBB, that + // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately + // dominates all of the successors transferred to it from MBB that MBB used + // to dominate. + if (MDT) { + MDT->addNewBlock(LoopBB, &MBB); + MDT->addNewBlock(RemainderBB, LoopBB); + for (auto &Succ : RemainderBB->successors()) { + if (MDT->dominates(&MBB, Succ)) { + MDT->changeImmediateDominator(Succ, RemainderBB); + } + } + } + + emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc); + + // Restore the EXEC mask + MachineBasicBlock::iterator First = RemainderBB->begin(); + BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addReg(SaveExec); +} + +// Extract pointer from Rsrc and return a zero-value Rsrc replacement. +static std::tuple<unsigned, unsigned> +extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) { + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Extract the ptr from the resource descriptor. 
+ unsigned RsrcPtr = + TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass, + AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); + + // Create an empty resource descriptor + unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); + uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat(); + + // Zero64 = 0 + BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64) + .addImm(0); + + // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} + BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo) + .addImm(RsrcDataFormat & 0xFFFFFFFF); + + // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} + BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi) + .addImm(RsrcDataFormat >> 32); + + // NewSRsrc = {Zero64, SRsrcFormat} + BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc) + .addReg(Zero64) + .addImm(AMDGPU::sub0_sub1) + .addReg(SRsrcFormatLo) + .addImm(AMDGPU::sub2) + .addReg(SRsrcFormatHi) + .addImm(AMDGPU::sub3); + + return std::make_tuple(RsrcPtr, NewSRsrc); +} + +void SIInstrInfo::legalizeOperands(MachineInstr &MI, + MachineDominatorTree *MDT) const { MachineFunction &MF = *MI.getParent()->getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -3617,75 +4010,56 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { return; } - // Legalize MUBUF* instructions by converting to addr64 form. - // FIXME: If we start using the non-addr64 instructions for compute, we - // may need to legalize them as above. This especially applies to the - // buffer_load_format_* variants and variants with idxen (or bothen). - int SRsrcIdx = + // Legalize MUBUF* instructions. 
+ int RsrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); - if (SRsrcIdx != -1) { + if (RsrcIdx != -1) { // We have an MUBUF instruction - MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx); - unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass; - if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()), - RI.getRegClass(SRsrcRC))) { + MachineOperand *Rsrc = &MI.getOperand(RsrcIdx); + unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass; + if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()), + RI.getRegClass(RsrcRC))) { // The operands are legal. // FIXME: We may need to legalize operands besided srsrc. return; } - MachineBasicBlock &MBB = *MI.getParent(); - - // Extract the ptr from the resource descriptor. - unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc, - &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); - - // Create an empty resource descriptor - unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); - uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); - - // Zero64 = 0 - BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64) - .addImm(0); + // Legalize a VGPR Rsrc. + // + // If the instruction is _ADDR64, we can avoid a waterfall by extracting + // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using + // a zero-value SRsrc. + // + // If the instruction is _OFFSET (both idxen and offen disabled), and we + // support ADDR64 instructions, we can convert to ADDR64 and do the same as + // above. + // + // Otherwise we are on non-ADDR64 hardware, and/or we have + // idxen/offen/bothen and we fall back to a waterfall loop. 
- // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} - BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo) - .addImm(RsrcDataFormat & 0xFFFFFFFF); - - // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} - BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi) - .addImm(RsrcDataFormat >> 32); - - // NewSRsrc = {Zero64, SRsrcFormat} - BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc) - .addReg(Zero64) - .addImm(AMDGPU::sub0_sub1) - .addReg(SRsrcFormatLo) - .addImm(AMDGPU::sub2) - .addReg(SRsrcFormatHi) - .addImm(AMDGPU::sub3); + MachineBasicBlock &MBB = *MI.getParent(); MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); - unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - if (VAddr) { + if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) { // This is already an ADDR64 instruction so we need to add the pointer // extracted from the resource descriptor to the current value of VAddr. unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + + unsigned RsrcPtr, NewSRsrc; + std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); - // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0 + // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0 DebugLoc DL = MI.getDebugLoc(); BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) - .addReg(SRsrcPtr, 0, AMDGPU::sub0) - .addReg(VAddr->getReg(), 0, AMDGPU::sub0); + .addReg(RsrcPtr, 0, AMDGPU::sub0) + .addReg(VAddr->getReg(), 0, AMDGPU::sub0); - // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1 + // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) - .addReg(SRsrcPtr, 0, AMDGPU::sub1) - .addReg(VAddr->getReg(), 0, AMDGPU::sub1); + .addReg(RsrcPtr, 0, AMDGPU::sub1) + .addReg(VAddr->getReg(), 0, AMDGPU::sub1); // NewVaddr = {NewVaddrHi, 
NewVaddrLo} BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) @@ -3693,13 +4067,20 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { .addImm(AMDGPU::sub0) .addReg(NewVAddrHi) .addImm(AMDGPU::sub1); - } else { + + VAddr->setReg(NewVAddr); + Rsrc->setReg(NewSRsrc); + } else if (!VAddr && ST.hasAddr64()) { // This instructions is the _OFFSET variant, so we need to convert it to // ADDR64. assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS && "FIXME: Need to emit flat atomics here"); + unsigned RsrcPtr, NewSRsrc; + std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc); + + unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); @@ -3715,10 +4096,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) .add(*VData) - .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. - // This will be replaced later - // with the new value of vaddr. - .add(*SRsrc) + .addReg(NewVAddr) + .addReg(NewSRsrc) .add(*SOffset) .add(*Offset); @@ -3735,21 +4114,19 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { MIB.addImm(TFE->getImm()); } - MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MIB.cloneMemRefs(MI); Addr64 = MIB; } else { // Atomics with return. Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) .add(*VData) .add(*VDataIn) - .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. - // This will be replaced later - // with the new value of vaddr. 
- .add(*SRsrc) + .addReg(NewVAddr) + .addReg(NewSRsrc) .add(*SOffset) .add(*Offset) .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) - .setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + .cloneMemRefs(MI); } MI.removeFromParent(); @@ -3757,23 +4134,20 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { // NewVaddr = {NewVaddrHi, NewVaddrLo} BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) - .addReg(SRsrcPtr, 0, AMDGPU::sub0) + .addReg(RsrcPtr, 0, AMDGPU::sub0) .addImm(AMDGPU::sub0) - .addReg(SRsrcPtr, 0, AMDGPU::sub1) + .addReg(RsrcPtr, 0, AMDGPU::sub1) .addImm(AMDGPU::sub1); - - VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr); - SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc); + } else { + // This is another variant; legalize Rsrc with waterfall loop from VGPRs + // to SGPRs. + loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT); } - - // Update the instruction to use NewVaddr - VAddr->setReg(NewVAddr); - // Update the instruction to use NewSRsrc - SRsrc->setReg(NewSRsrc); } } -void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { +void SIInstrInfo::moveToVALU(MachineInstr &TopInst, + MachineDominatorTree *MDT) const { SetVectorType Worklist; Worklist.insert(&TopInst); @@ -3791,34 +4165,62 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { break; case AMDGPU::S_ADD_U64_PSEUDO: case AMDGPU::S_SUB_U64_PSEUDO: - splitScalar64BitAddSub(Worklist, Inst); + splitScalar64BitAddSub(Worklist, Inst, MDT); Inst.eraseFromParent(); continue; case AMDGPU::S_ADD_I32: case AMDGPU::S_SUB_I32: // FIXME: The u32 versions currently selected use the carry. 
- if (moveScalarAddSub(Worklist, Inst)) + if (moveScalarAddSub(Worklist, Inst, MDT)) continue; // Default handling break; case AMDGPU::S_AND_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); Inst.eraseFromParent(); continue; case AMDGPU::S_OR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT); Inst.eraseFromParent(); continue; case AMDGPU::S_XOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT); + Inst.eraseFromParent(); + continue; + + case AMDGPU::S_NAND_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT); + Inst.eraseFromParent(); + continue; + + case AMDGPU::S_NOR_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT); + Inst.eraseFromParent(); + continue; + + case AMDGPU::S_XNOR_B64: + if (ST.hasDLInsts()) + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT); + else + splitScalar64BitXnor(Worklist, Inst, MDT); + Inst.eraseFromParent(); + continue; + + case AMDGPU::S_ANDN2_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT); + Inst.eraseFromParent(); + continue; + + case AMDGPU::S_ORN2_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT); Inst.eraseFromParent(); continue; case AMDGPU::S_NOT_B64: - splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); Inst.eraseFromParent(); continue; @@ -3899,90 +4301,31 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { Inst.eraseFromParent(); continue; - case AMDGPU::S_XNOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32); + case AMDGPU::S_NAND_B32: + splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32); Inst.eraseFromParent(); continue; - case 
AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: { - unsigned VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff); - auto Add = MRI.getUniqueVRegDef(VAddr->getReg()); - unsigned Offset = 0; - - // FIXME: This isn't safe because the addressing mode doesn't work - // correctly if vaddr is negative. - // - // FIXME: Should probably be done somewhere else, maybe SIFoldOperands. - // - // See if we can extract an immediate offset by recognizing one of these: - // V_ADD_I32_e32 dst, imm, src1 - // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1 - // V_ADD will be removed by "Remove dead machine instructions". - if (Add && - (Add->getOpcode() == AMDGPU::V_ADD_I32_e32 || - Add->getOpcode() == AMDGPU::V_ADD_U32_e64)) { - static const unsigned SrcNames[2] = { - AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - }; - - // Find a literal offset in one of source operands. - for (int i = 0; i < 2; i++) { - const MachineOperand *Src = - getNamedOperand(*Add, SrcNames[i]); - - if (Src->isReg()) { - auto Mov = MRI.getUniqueVRegDef(Src->getReg()); - if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32) - Src = &Mov->getOperand(1); - } - - if (Src) { - if (Src->isImm()) - Offset = Src->getImm(); - else if (Src->isCImm()) - Offset = Src->getCImm()->getZExtValue(); - } - - if (Offset && isLegalMUBUFImmOffset(Offset)) { - VAddr = getNamedOperand(*Add, SrcNames[!i]); - break; - } - - Offset = 0; - } - } - - MachineInstr *NewInstr = - BuildMI(*MBB, Inst, Inst.getDebugLoc(), - get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst) - .add(*VAddr) // vaddr - .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc - .addImm(0) // soffset - .addImm(Offset) // offset - .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm()) - .addImm(0) // slc - .addImm(0) // tfe - .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end()) - .getInstr(); + case AMDGPU::S_NOR_B32: + splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32); + 
Inst.eraseFromParent(); + continue; - MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(), - VDst); - addUsersToMoveToVALUWorklist(VDst, MRI, Worklist); + case AMDGPU::S_ANDN2_B32: + splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32); Inst.eraseFromParent(); + continue; - // Legalize all operands other than the offset. Notably, convert the srsrc - // into SGPRs using v_readfirstlane if needed. - legalizeOperands(*NewInstr); + case AMDGPU::S_ORN2_B32: + splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); + Inst.eraseFromParent(); continue; } - } if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { // We cannot move this instruction to the VALU, so we should try to // legalize its operands instead. - legalizeOperands(Inst); + legalizeOperands(Inst, MDT); continue; } @@ -4071,7 +4414,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } // Legalize the operands - legalizeOperands(Inst); + legalizeOperands(Inst, MDT); if (HasDst) addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); @@ -4079,8 +4422,8 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } // Add/sub require special handling to deal with carry outs. -bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, - MachineInstr &Inst) const { +bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT) const { if (ST.hasAddNoCarry()) { // Assume there is no user of scc since we don't select this in that case. 
// Since scc isn't used, it doesn't really matter if the i32 or u32 variant @@ -4104,7 +4447,7 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, Inst.setDesc(get(NewOpc)); Inst.addImplicitDefUseOperands(*MBB.getParent()); MRI.replaceRegWith(OldDstReg, ResultReg); - legalizeOperands(Inst); + legalizeOperands(Inst, MDT); addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); return true; @@ -4151,23 +4494,116 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, MachineOperand &Src0 = Inst.getOperand(1); MachineOperand &Src1 = Inst.getOperand(2); - legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); - legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); - - unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); if (ST.hasDLInsts()) { + unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); + legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); + BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest) .add(Src0) .add(Src1); + + MRI.replaceRegWith(Dest.getReg(), NewDest); + addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); } else { - unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor) - .add(Src0) + // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can + // invert either source and then perform the XOR. If either source is a + // scalar register, then we can leave the inversion on the scalar unit to + // acheive a better distrubution of scalar and vector instructions. 
+ bool Src0IsSGPR = Src0.isReg() && + RI.isSGPRClass(MRI.getRegClass(Src0.getReg())); + bool Src1IsSGPR = Src1.isReg() && + RI.isSGPRClass(MRI.getRegClass(Src1.getReg())); + MachineInstr *Not = nullptr; + MachineInstr *Xor = nullptr; + unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + // Build a pair of scalar instructions and add them to the work list. + // The next iteration over the work list will lower these to the vector + // unit as necessary. + if (Src0IsSGPR) { + Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp) + .add(Src0); + Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) + .addReg(Temp) .add(Src1); + } else if (Src1IsSGPR) { + Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp) + .add(Src1); + Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest) + .add(Src0) + .addReg(Temp); + } else { + Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp) + .add(Src0) + .add(Src1); + Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) + .addReg(Temp); + Worklist.insert(Not); + } + + MRI.replaceRegWith(Dest.getReg(), NewDest); - BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), NewDest) - .addReg(Xor); + Worklist.insert(Xor); + + addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); } +} + +void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist, + MachineInstr &Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + const DebugLoc &DL = Inst.getDebugLoc(); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + + unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + MachineInstr &Op = *BuildMI(MBB, MII, 
DL, get(Opcode), Interm) + .add(Src0) + .add(Src1); + + MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest) + .addReg(Interm); + + Worklist.insert(&Op); + Worklist.insert(&Not); + + MRI.replaceRegWith(Dest.getReg(), NewDest); + addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); +} + +void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist, + MachineInstr &Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + const DebugLoc &DL = Inst.getDebugLoc(); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + + unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm) + .add(Src1); + + MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest) + .add(Src0) + .addReg(Interm); + + Worklist.insert(&Not); + Worklist.insert(&Op); MRI.replaceRegWith(Dest.getReg(), NewDest); addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist); @@ -4200,13 +4636,13 @@ void SIInstrInfo::splitScalar64BitUnaryOp( const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); - BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); + MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); - BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); + MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1); unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, 
MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) @@ -4217,6 +4653,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp( MRI.replaceRegWith(Dest.getReg(), FullDestReg); + Worklist.insert(&LoHalf); + Worklist.insert(&HiHalf); + // We don't need to legalizeOperands here because for a single operand, src0 // will support any kind of input. @@ -4224,8 +4663,9 @@ void SIInstrInfo::splitScalar64BitUnaryOp( addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitAddSub( - SetVectorType &Worklist, MachineInstr &Inst) const { +void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist, + MachineInstr &Inst, + MachineDominatorTree *MDT) const { bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); MachineBasicBlock &MBB = *Inst.getParent(); @@ -4285,16 +4725,16 @@ void SIInstrInfo::splitScalar64BitAddSub( // Try to legalize the operands in case we need to swap the order to keep it // valid. - legalizeOperands(*LoHalf); - legalizeOperands(*HiHalf); + legalizeOperands(*LoHalf, MDT); + legalizeOperands(*HiHalf, MDT); // Move all users of this moved vlaue. 
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitBinaryOp( - SetVectorType &Worklist, MachineInstr &Inst, - unsigned Opcode) const { +void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist, + MachineInstr &Inst, unsigned Opcode, + MachineDominatorTree *MDT) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -4321,6 +4761,10 @@ void SIInstrInfo::splitScalar64BitBinaryOp( AMDGPU::sub0, Src0SubRC); MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub1, Src0SubRC); + MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, + AMDGPU::sub1, Src1SubRC); const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); @@ -4331,11 +4775,6 @@ void SIInstrInfo::splitScalar64BitBinaryOp( .add(SrcReg0Sub0) .add(SrcReg1Sub0); - MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, - AMDGPU::sub1, Src0SubRC); - MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, - AMDGPU::sub1, Src1SubRC); - unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) .add(SrcReg0Sub1) @@ -4350,22 +4789,62 @@ void SIInstrInfo::splitScalar64BitBinaryOp( MRI.replaceRegWith(Dest.getReg(), FullDestReg); - // Try to legalize the operands in case we need to swap the order to keep it - // valid. - legalizeOperands(LoHalf); - legalizeOperands(HiHalf); + Worklist.insert(&LoHalf); + Worklist.insert(&HiHalf); // Move all users of this moved vlaue. 
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } +void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist, + MachineInstr &Inst, + MachineDominatorTree *MDT) const { + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + const DebugLoc &DL = Inst.getDebugLoc(); + + MachineBasicBlock::iterator MII = Inst; + + const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); + + unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + + MachineOperand* Op0; + MachineOperand* Op1; + + if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) { + Op0 = &Src0; + Op1 = &Src1; + } else { + Op0 = &Src1; + Op1 = &Src0; + } + + BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm) + .add(*Op0); + + unsigned NewDest = MRI.createVirtualRegister(DestRC); + + MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest) + .addReg(Interm) + .add(*Op1); + + MRI.replaceRegWith(Dest.getReg(), NewDest); + + Worklist.insert(&Xor); +} + void SIInstrInfo::splitScalar64BitBCNT( SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst.getDebugLoc(); + const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest = Inst.getOperand(0); MachineOperand &Src = Inst.getOperand(1); @@ -4401,7 +4880,7 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst.getDebugLoc(); + const DebugLoc &DL = Inst.getDebugLoc(); MachineOperand &Dest = Inst.getOperand(0); uint32_t Imm = Inst.getOperand(2).getImm(); @@ -4546,10 +5025,10 @@ void 
SIInstrInfo::addSCCDefUsersToVALUWorklist( make_range(MachineBasicBlock::iterator(SCCDefInst), SCCDefInst.getParent()->end())) { // Exit if we find another SCC def. - if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1) + if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) return; - if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1) + if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) Worklist.insert(&MI); } } @@ -4716,7 +5195,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, return AMDGPU::NoRegister; assert(!MI.memoperands_empty() && - (*MI.memoperands_begin())->getAddrSpace() == ST.getAMDGPUAS().PRIVATE_ADDRESS); + (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS); FrameIndex = Addr->getIndex(); return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); @@ -4777,12 +5256,6 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { // If we have a definitive size, we can use it. Otherwise we need to inspect // the operands to know the size. - // - // FIXME: Instructions that have a base 32-bit encoding report their size as - // 4, even though they are really 8 bytes if they have a literal operand. - if (DescSize != 0 && DescSize != 4) - return DescSize; - if (isFixedSize(MI)) return DescSize; @@ -4791,23 +5264,27 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { if (isVALU(MI) || isSALU(MI)) { int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) - return 4; // No operands. + return DescSize; // No operands. 
if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx])) - return 8; + return DescSize + 4; int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) - return 4; + return DescSize; if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx])) - return 8; + return DescSize + 4; - return 4; - } + int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + if (Src2Idx == -1) + return DescSize; - if (DescSize == 4) - return 4; + if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx])) + return DescSize + 4; + + return DescSize; + } switch (Opc) { case TargetOpcode::IMPLICIT_DEF: @@ -4823,7 +5300,7 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); } default: - llvm_unreachable("unable to find instruction size"); + return DescSize; } } @@ -4835,7 +5312,7 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { return true; for (const MachineMemOperand *MMO : MI.memoperands()) { - if (MMO->getAddrSpace() == ST.getAMDGPUAS().FLAT_ADDRESS) + if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS) return true; } return false; @@ -5069,3 +5546,84 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { return MCOp; } + +static +TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) { + assert(RegOpnd.isReg()); + return RegOpnd.isUndef() ? 
TargetInstrInfo::RegSubRegPair() : + getRegSubRegPair(RegOpnd); +} + +TargetInstrInfo::RegSubRegPair +llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) { + assert(MI.isRegSequence()); + for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I) + if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) { + auto &RegOp = MI.getOperand(1 + 2 * I); + return getRegOrUndef(RegOp); + } + return TargetInstrInfo::RegSubRegPair(); +} + +// Try to find the definition of reg:subreg in subreg-manipulation pseudos +// Following a subreg of reg:subreg isn't supported +static bool followSubRegDef(MachineInstr &MI, + TargetInstrInfo::RegSubRegPair &RSR) { + if (!RSR.SubReg) + return false; + switch (MI.getOpcode()) { + default: break; + case AMDGPU::REG_SEQUENCE: + RSR = getRegSequenceSubReg(MI, RSR.SubReg); + return true; + // EXTRACT_SUBREG ins't supported as this would follow a subreg of subreg + case AMDGPU::INSERT_SUBREG: + if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm()) + // inserted the subreg we're looking for + RSR = getRegOrUndef(MI.getOperand(2)); + else { // the subreg in the rest of the reg + auto R1 = getRegOrUndef(MI.getOperand(1)); + if (R1.SubReg) // subreg of subreg isn't supported + return false; + RSR.Reg = R1.Reg; + } + return true; + } + return false; +} + +MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, + MachineRegisterInfo &MRI) { + assert(MRI.isSSA()); + if (!TargetRegisterInfo::isVirtualRegister(P.Reg)) + return nullptr; + + auto RSR = P; + auto *DefInst = MRI.getVRegDef(RSR.Reg); + while (auto *MI = DefInst) { + DefInst = nullptr; + switch (MI->getOpcode()) { + case AMDGPU::COPY: + case AMDGPU::V_MOV_B32_e32: { + auto &Op1 = MI->getOperand(1); + if (Op1.isReg() && + TargetRegisterInfo::isVirtualRegister(Op1.getReg())) { + if (Op1.isUndef()) + return nullptr; + RSR = getRegSubRegPair(Op1); + DefInst = MRI.getVRegDef(RSR.Reg); + } + break; + } + default: + if (followSubRegDef(*MI, RSR)) { + if 
(!RSR.Reg) + return nullptr; + DefInst = MRI.getVRegDef(RSR.Reg); + } + } + if (!DefInst) + return MI; + } + return nullptr; +} diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index d681b926504ed..5b1a05f3785ec 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -37,6 +37,7 @@ namespace llvm { class APInt; +class MachineDominatorTree; class MachineRegisterInfo; class RegScavenger; class GCNSubtarget; @@ -79,8 +80,8 @@ public: private: void swapOperands(MachineInstr &Inst) const; - bool moveScalarAddSub(SetVectorType &Worklist, - MachineInstr &Inst) const; + bool moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT = nullptr) const; void lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const; @@ -88,14 +89,26 @@ private: void lowerScalarXnor(SetVectorType &Worklist, MachineInstr &Inst) const; + void splitScalarNotBinop(SetVectorType &Worklist, + MachineInstr &Inst, + unsigned Opcode) const; + + void splitScalarBinOpN2(SetVectorType &Worklist, + MachineInstr &Inst, + unsigned Opcode) const; + void splitScalar64BitUnaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; - void splitScalar64BitAddSub(SetVectorType &Worklist, - MachineInstr &Inst) const; + void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT = nullptr) const; + + void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst, + unsigned Opcode, + MachineDominatorTree *MDT = nullptr) const; - void splitScalar64BitBinaryOp(SetVectorType &Worklist, - MachineInstr &Inst, unsigned Opcode) const; + void splitScalar64BitXnor(SetVectorType &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT = nullptr) const; void splitScalar64BitBCNT(SetVectorType &Worklist, MachineInstr &Inst) const; @@ -160,12 +173,11 @@ public: int64_t &Offset1, int64_t &Offset2) const override; - bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, 
unsigned &BaseReg, - int64_t &Offset, - const TargetRegisterInfo *TRI) const final; + bool getMemOperandWithOffset(MachineInstr &LdSt, MachineOperand *&BaseOp, + int64_t &Offset, + const TargetRegisterInfo *TRI) const final; - bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1, - MachineInstr &SecondLdSt, unsigned BaseReg2, + bool shouldClusterMemOps(MachineOperand &BaseOp1, MachineOperand &BaseOp2, unsigned NumLoads) const override; bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, @@ -225,6 +237,9 @@ public: bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; + bool findCommutedOpIndices(MCInstrDesc Desc, unsigned & SrcOpIdx0, + unsigned & SrcOpIdx1) const; + bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override; @@ -276,7 +291,7 @@ public: unsigned TrueReg, unsigned FalseReg) const; unsigned getAddressSpaceForPseudoSourceKind( - PseudoSourceValue::PSVKind Kind) const override; + unsigned Kind) const override; bool areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, @@ -589,6 +604,14 @@ public: return MI.getDesc().TSFlags & ClampFlags; } + static bool usesFPDPRounding(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::FPDPRounding; + } + + bool usesFPDPRounding(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding; + } + bool isVGPRCopy(const MachineInstr &MI) const { assert(MI.isCopy()); unsigned Dest = MI.getOperand(0).getReg(); @@ -689,6 +712,12 @@ public: unsigned OpName) const; bool hasAnyModifiersSet(const MachineInstr &MI) const; + bool canShrink(const MachineInstr &MI, + const MachineRegisterInfo &MRI) const; + + MachineInstr *buildShrunkInst(MachineInstr &MI, + unsigned NewOpcode) const; + bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override; @@ -719,6 +748,16 @@ public: /// This form should usually be preferred since it handles operands /// 
with unknown register classes. unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const { + const MachineOperand &MO = MI.getOperand(OpNo); + if (MO.isReg()) { + if (unsigned SubReg = MO.getSubReg()) { + assert(RI.getRegSizeInBits(*RI.getSubClassWithSubReg( + MI.getParent()->getParent()->getRegInfo(). + getRegClass(MO.getReg()), SubReg)) >= 32 && + "Sub-dword subregs are not supported"); + return RI.getSubRegIndexLaneMask(SubReg).getNumLanes() * 4; + } + } return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8; } @@ -777,14 +816,16 @@ public: MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const; - /// Legalize all operands in this instruction. This function may - /// create new instruction and insert them before \p MI. - void legalizeOperands(MachineInstr &MI) const; + /// Legalize all operands in this instruction. This function may create new + /// instructions and control-flow around \p MI. If present, \p MDT is + /// updated. + void legalizeOperands(MachineInstr &MI, + MachineDominatorTree *MDT = nullptr) const; /// Replace this instruction's opcode with the equivalent VALU /// opcode. This function will also move the users of \p MI to the - /// VALU if necessary. - void moveToVALU(MachineInstr &MI) const; + /// VALU if necessary. If present, \p MDT is updated. + void moveToVALU(MachineInstr &MI, MachineDominatorTree *MDT = nullptr) const; void insertWaitStates(MachineBasicBlock &MBB,MachineBasicBlock::iterator MI, int Count) const; @@ -885,9 +926,36 @@ public: /// Return -1 if the target-specific opcode for the pseudo instruction does /// not exist. If Opcode is not a pseudo instruction, this is identity. 
int pseudoToMCOpcode(int Opcode) const; - }; +/// \brief Returns true if a reg:subreg pair P has a TRC class +inline bool isOfRegClass(const TargetInstrInfo::RegSubRegPair &P, + const TargetRegisterClass &TRC, + MachineRegisterInfo &MRI) { + auto *RC = MRI.getRegClass(P.Reg); + if (!P.SubReg) + return RC == &TRC; + auto *TRI = MRI.getTargetRegisterInfo(); + return RC == TRI->getMatchingSuperRegClass(RC, &TRC, P.SubReg); +} + +/// \brief Create RegSubRegPair from a register MachineOperand +inline +TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O) { + assert(O.isReg()); + return TargetInstrInfo::RegSubRegPair(O.getReg(), O.getSubReg()); +} + +/// \brief Return the SubReg component from REG_SEQUENCE +TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, + unsigned SubReg); + +/// \brief Return the defining instruction for a given reg:subreg pair +/// skipping copy like instructions and subreg-manipulation pseudos. +/// Following another subreg of a reg:subreg isn't supported. +MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, + MachineRegisterInfo &MRI); + namespace AMDGPU { LLVM_READONLY @@ -900,6 +968,9 @@ namespace AMDGPU { int getSDWAOp(uint16_t Opcode); LLVM_READONLY + int getDPPOp32(uint16_t Opcode); + + LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode); LLVM_READONLY @@ -911,6 +982,12 @@ namespace AMDGPU { LLVM_READONLY int getAddr64Inst(uint16_t Opcode); + /// Check if \p Opcode is an Addr64 opcode. + /// + /// \returns \p Opcode if it is an Addr64 opcode, otherwise -1. 
+ LLVM_READONLY + int getIfAddr64Inst(uint16_t Opcode); + LLVM_READONLY int getMUBUFNoLdsInst(uint16_t Opcode); @@ -923,6 +1000,9 @@ namespace AMDGPU { LLVM_READONLY int getSOPKOp(uint16_t Opcode); + LLVM_READONLY + int getGlobalSaddrOp(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 8fa37aa83daed..13afa4d4974bf 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -40,9 +40,9 @@ def SIEncodingFamily { def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>; -def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT", - SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>, - [SDNPMayLoad, SDNPMemOperand] +def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD", + SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>]>, + [SDNPMayLoad, SDNPMemOperand] >; def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2, @@ -69,36 +69,34 @@ def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; -def SDTbuffer_load : SDTypeProfile<1, 9, +def SDTtbuffer_load : SDTypeProfile<1, 8, [ // vdata SDTCisVT<1, v4i32>, // rsrc SDTCisVT<2, i32>, // vindex(VGPR) SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // dfmt(imm) - SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // glc(imm) - SDTCisVT<9, i32> // slc(imm) + SDTCisVT<6, i32>, // format(imm) + SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<8, i1> // idxen(imm) ]>; -def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTbuffer_load, +def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", SDTtbuffer_load, [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; def 
SItbuffer_load_d16 : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT_D16", - SDTbuffer_load, + SDTtbuffer_load, [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]>; -def SDTtbuffer_store : SDTypeProfile<0, 10, +def SDTtbuffer_store : SDTypeProfile<0, 9, [ // vdata SDTCisVT<1, v4i32>, // rsrc SDTCisVT<2, i32>, // vindex(VGPR) SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // dfmt(imm) - SDTCisVT<7, i32>, // nfmt(imm) - SDTCisVT<8, i32>, // glc(imm) - SDTCisVT<9, i32> // slc(imm) + SDTCisVT<6, i32>, // format(imm) + SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<8, i1> // idxen(imm) ]>; def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store, @@ -110,13 +108,15 @@ def SItbuffer_store_d16 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_D16", SDTtbuffer_store, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; -def SDTBufferLoad : SDTypeProfile<1, 5, +def SDTBufferLoad : SDTypeProfile<1, 7, [ // vdata SDTCisVT<1, v4i32>, // rsrc - SDTCisVT<2, i32>, // vindex - SDTCisVT<3, i32>, // offset - SDTCisVT<4, i1>, // glc - SDTCisVT<5, i1>]>; // slc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // offset(imm) + SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<7, i1>]>; // idxen(imm) def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; @@ -126,13 +126,15 @@ def SIbuffer_load_format_d16 : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT_D16", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; -def SDTBufferStore : SDTypeProfile<0, 6, +def SDTBufferStore : SDTypeProfile<0, 8, [ // vdata SDTCisVT<1, v4i32>, // rsrc - SDTCisVT<2, i32>, // vindex - SDTCisVT<3, i32>, // offset - SDTCisVT<4, i1>, // glc - SDTCisVT<5, i1>]>; // slc + SDTCisVT<2, i32>, // vindex(VGPR) + SDTCisVT<3, i32>, // voffset(VGPR) + SDTCisVT<4, i32>, // soffset(SGPR) + SDTCisVT<5, i32>, // 
offset(imm) + SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<7, i1>]>; // idxen(imm) def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; @@ -144,13 +146,16 @@ def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16", [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>; class SDBufferAtomic<string opcode> : SDNode <opcode, - SDTypeProfile<1, 5, + SDTypeProfile<1, 8, [SDTCisVT<0, i32>, // dst SDTCisVT<1, i32>, // vdata SDTCisVT<2, v4i32>, // rsrc - SDTCisVT<3, i32>, // vindex - SDTCisVT<4, i32>, // offset - SDTCisVT<5, i1>]>, // slc + SDTCisVT<3, i32>, // vindex(VGPR) + SDTCisVT<4, i32>, // voffset(VGPR) + SDTCisVT<5, i32>, // soffset(SGPR) + SDTCisVT<6, i32>, // offset(imm) + SDTCisVT<7, i32>, // cachepolicy(imm) + SDTCisVT<8, i1>]>, // idxen(imm) [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; @@ -166,14 +171,17 @@ def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">; def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">; def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", - SDTypeProfile<1, 6, + SDTypeProfile<1, 9, [SDTCisVT<0, i32>, // dst SDTCisVT<1, i32>, // src SDTCisVT<2, i32>, // cmp SDTCisVT<3, v4i32>, // rsrc - SDTCisVT<4, i32>, // vindex - SDTCisVT<5, i32>, // offset - SDTCisVT<6, i1>]>, // slc + SDTCisVT<4, i32>, // vindex(VGPR) + SDTCisVT<5, i32>, // voffset(VGPR) + SDTCisVT<6, i32>, // soffset(SGPR) + SDTCisVT<7, i32>, // offset(imm) + SDTCisVT<8, i32>, // cachepolicy(imm) + SDTCisVT<9, i1>]>, // idxen(imm) [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] >; @@ -487,24 +495,7 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{ }]>; class VGPRImm <dag frag> : PatLeaf<frag, [{ - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { - return false; - } - const SIRegisterInfo *SIRI = - static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); - unsigned Limit 
= 0; - for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); - Limit < 10 && U != E; ++U, ++Limit) { - const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); - - // If the register class is unknown, it could be an unknown - // register class that needs to be an SGPR, e.g. an inline asm - // constraint - if (!RC || SIRI->isSGPRClass(RC)) - return false; - } - - return Limit < 10; + return isVGPRImm(N); }]>; def NegateImm : SDNodeXForm<imm, [{ @@ -746,14 +737,13 @@ def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>; -def R128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>; +def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>; def D16 : NamedOperandBit<"D16", NamedMatchClass<"D16">>; def LWE : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>; def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; -def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>; -def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>; +def FORMAT : NamedOperandU8<"FORMAT", NamedMatchClass<"FORMAT">>; def DMask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; @@ -1632,7 +1622,7 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, 0, // 64-bit dst - No DPP or SDWA for 64-bit operands !if(!eq(Src0VT.Size, 64), 0, // 64-bit src0 - !if(!eq(Src0VT.Size, 64), + !if(!eq(Src1VT.Size, 64), 0, // 64-bit src2 1 ) @@ -1641,6 +1631,12 @@ class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, ); } +class getHasDPP <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, + ValueType Src1VT = i32> { + bit ret = !if(!eq(NumSrcArgs, 0), 0, + getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret); +} + class BitOr<bit a, bit b> { 
bit ret = !if(a, 1, !if(b, 1, 0)); } @@ -1649,6 +1645,11 @@ class BitAnd<bit a, bit b> { bit ret = !if(a, !if(b, 1, 0), 0); } +def PatGenMode { + int NoPattern = 0; + int Pattern = 1; +} + class VOPProfile <list<ValueType> _ArgVT> { field list<ValueType> ArgVT = _ArgVT; @@ -1715,7 +1716,10 @@ class VOPProfile <list<ValueType> _ArgVT> { field bit HasSDWAOMod = isFloatType<DstVT>.ret; field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; - field bit HasSDWA9 = HasExt; + field bit HasExtDPP = getHasDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; + field bit HasExtSDWA = HasExt; + field bit HasExtSDWA9 = HasExt; + field int NeedPatGen = PatGenMode.NoPattern; field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); @@ -1743,8 +1747,10 @@ class VOPProfile <list<ValueType> _ArgVT> { getOpSelMod<Src0VT>.ret, getOpSelMod<Src1VT>.ret, getOpSelMod<Src2VT>.ret>.ret; - field dag InsDPP = getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs, - HasModifiers, Src0ModDPP, Src1ModDPP>.ret; + field dag InsDPP = !if(HasExtDPP, + getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs, + HasModifiers, Src0ModDPP, Src1ModDPP>.ret, + (ins)); field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, HasSDWAOMod, Src0ModSDWA, Src1ModSDWA, DstVT>.ret; @@ -1758,14 +1764,21 @@ class VOPProfile <list<ValueType> _ArgVT> { HasSrc0FloatMods, HasSrc1FloatMods, HasSrc2FloatMods>.ret; - field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; + field string AsmDPP = !if(HasExtDPP, + getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret, ""); field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret; field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret; } class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> { let HasExt = 0; - let HasSDWA9 = 0; + let HasExtDPP = 0; + let HasExtSDWA = 0; + let HasExtSDWA9 = 0; +} 
+ +class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.Pattern> : VOPProfile <p.ArgVT> { + let NeedPatGen = mode; } def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; @@ -1788,6 +1801,8 @@ def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>; def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>; def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>; +def VOP_V2I16_F32_F32 : VOPProfile <[v2i16, f32, f32, untyped]>; +def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, untyped]>; def VOP_F32_V2F16_V2F16_V2F16 : VOPProfile <[f32, v2f16, v2f16, v2f16]>; @@ -1925,6 +1940,15 @@ def getBasicFromSDWAOp : InstrMapping { let ValueCols = [["Default"]]; } +// Maps ordinary instructions to their DPP counterparts +def getDPPOp32 : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["AsmVariantName"]; + let KeyCol = ["Default"]; + let ValueCols = [["DPP"]]; +} + // Maps an commuted opcode to its original version def getCommuteOrig : InstrMapping { let FilterClass = "Commutable_REV"; @@ -1977,6 +2001,14 @@ def getAddr64Inst : InstrMapping { let ValueCols = [["1"]]; } +def getIfAddr64Inst : InstrMapping { + let FilterClass = "MUBUFAddr64Table"; + let RowFields = ["OpName"]; + let ColFields = ["IsAddr64"]; + let KeyCol = ["1"]; + let ValueCols = [["1"]]; +} + def getMUBUFNoLdsInst : InstrMapping { let FilterClass = "MUBUFLdsTable"; let RowFields = ["OpName"]; @@ -2003,6 +2035,15 @@ def getAtomicNoRetOp : InstrMapping { let ValueCols = [["0"]]; } +// Maps a GLOBAL to its SADDR form. 
+def getGlobalSaddrOp : InstrMapping { + let FilterClass = "GlobalSaddrTable"; + let RowFields = ["SaddrOp"]; + let ColFields = ["IsSaddr"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + include "SIInstructions.td" include "DSInstructions.td" diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 5c10646161b39..b6b00c2e4257a 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -15,8 +15,8 @@ class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateContro let SubtargetPredicate = isGCN; } -include "VOPInstructions.td" include "SOPInstructions.td" +include "VOPInstructions.td" include "SMInstructions.td" include "FLATInstructions.td" include "BUFInstructions.td" @@ -164,29 +164,26 @@ def S_SUB_U64_CO_PSEUDO : SPseudoInstSI < } // End usesCustomInserter = 1, Defs = [SCC] -let usesCustomInserter = 1, SALU = 1 in { -def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins), +let usesCustomInserter = 1 in { +def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins), [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>; } // End let usesCustomInserter = 1, SALU = 1 -def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst), +def S_MOV_B64_term : SPseudoInstSI<(outs SReg_64:$dst), (ins SSrc_b64:$src0)> { - let SALU = 1; let isAsCheapAsAMove = 1; let isTerminator = 1; } -def S_XOR_B64_term : PseudoInstSI<(outs SReg_64:$dst), +def S_XOR_B64_term : SPseudoInstSI<(outs SReg_64:$dst), (ins SSrc_b64:$src0, SSrc_b64:$src1)> { - let SALU = 1; let isAsCheapAsAMove = 1; let isTerminator = 1; let Defs = [SCC]; } -def S_ANDN2_B64_term : PseudoInstSI<(outs SReg_64:$dst), +def S_ANDN2_B64_term : SPseudoInstSI<(outs SReg_64:$dst), (ins SSrc_b64:$src0, SSrc_b64:$src1)> { - let SALU = 1; let isAsCheapAsAMove = 1; let isTerminator = 1; } @@ -250,7 +247,7 @@ def SI_LOOP : CFPseudoInstSI < (outs), (ins SReg_64:$saved, brtarget:$target), [(AMDGPUloop i64:$saved, 
bb:$target)], 1, 1> { let Size = 8; - let isBranch = 0; + let isBranch = 1; let hasSideEffects = 1; } @@ -267,14 +264,6 @@ def SI_END_CF : CFPseudoInstSI < let mayStore = 1; } -def SI_BREAK : CFPseudoInstSI < - (outs SReg_64:$dst), (ins SReg_64:$src), - [(set i64:$dst, (int_amdgcn_break i64:$src))], 1> { - let Size = 4; - let isAsCheapAsAMove = 1; - let isReMaterializable = 1; -} - def SI_IF_BREAK : CFPseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src), [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> { @@ -283,14 +272,6 @@ def SI_IF_BREAK : CFPseudoInstSI < let isReMaterializable = 1; } -def SI_ELSE_BREAK : CFPseudoInstSI < - (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), - [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> { - let Size = 4; - let isAsCheapAsAMove = 1; - let isReMaterializable = 1; -} - let Uses = [EXEC] in { multiclass PseudoInstKill <dag ins> { @@ -326,6 +307,7 @@ def SI_ILLEGAL_COPY : SPseudoInstSI < def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> { let isTerminator = 1; let usesCustomInserter = 1; + let isBranch = 1; } def SI_PS_LIVE : PseudoInstSI < @@ -598,7 +580,13 @@ def : Pat < (int_amdgcn_kill (i1 (setcc f32:$src, InlineFPImm<f32>:$imm, cond:$cond))), (SI_KILL_F32_COND_IMM_PSEUDO $src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond)) >; -// TODO: we could add more variants for other types of conditionals + + // TODO: we could add more variants for other types of conditionals + +def : Pat < + (int_amdgcn_icmp i1:$src, (i1 0), (i32 33)), + (COPY $src) // Return the SGPRs representing i1 src +>; //===----------------------------------------------------------------------===// // VOP1 Patterns @@ -730,12 +718,14 @@ defm : SelectPat <i32, V_CNDMASK_B32_e64>; defm : SelectPat <f16, V_CNDMASK_B32_e64>; defm : SelectPat <f32, V_CNDMASK_B32_e64>; +let AddedComplexity = 1 in { def : GCNPat < - (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), + (i32 (add (i32 
(getDivergentFrag<ctpop>.ret i32:$popcnt)), i32:$val)), (V_BCNT_U32_B32_e64 $popcnt, $val) >; +} def : GCNPat < - (i16 (add (i16 (trunc (ctpop i32:$popcnt))), i16:$val)), + (i16 (add (i16 (trunc (getDivergentFrag<ctpop>.ret i32:$popcnt))), i16:$val)), (V_BCNT_U32_B32_e64 $popcnt, $val) >; @@ -867,6 +857,8 @@ def : BitConvert <f64, v2f32, VReg_64>; def : BitConvert <v2f32, f64, VReg_64>; def : BitConvert <f64, v2i32, VReg_64>; def : BitConvert <v2i32, f64, VReg_64>; +def : BitConvert <v4i16, v4f16, VReg_64>; +def : BitConvert <v4f16, v4i16, VReg_64>; // FIXME: Make SGPR def : BitConvert <v2i32, v4f16, VReg_64>; @@ -1324,6 +1316,38 @@ def : GCNPat < >; def : GCNPat < + (i1 (add i1:$src0, i1:$src1)), + (S_XOR_B64 $src0, $src1) +>; + +def : GCNPat < + (i1 (sub i1:$src0, i1:$src1)), + (S_XOR_B64 $src0, $src1) +>; + +let AddedComplexity = 1 in { +def : GCNPat < + (i1 (add i1:$src0, (i1 -1))), + (S_NOT_B64 $src0) +>; + +def : GCNPat < + (i1 (sub i1:$src0, (i1 -1))), + (S_NOT_B64 $src0) +>; +} + +def : GCNPat < + (f16 (sint_to_fp i1:$src)), + (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)) +>; + +def : GCNPat < + (f16 (uint_to_fp i1:$src)), + (V_CVT_F16_F32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)) +>; + +def : GCNPat < (f32 (sint_to_fp i1:$src)), (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src) >; @@ -1464,13 +1488,32 @@ class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPa def : ExpPattern<AMDGPUexport, i32, EXP>; def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>; -// COPY_TO_REGCLASS is workaround tablegen bug from multiple outputs +// COPY is workaround tablegen bug from multiple outputs // from S_LSHL_B32's multiple outputs from implicit scc def. 
def : GCNPat < (v2i16 (build_vector (i16 0), i16:$src1)), - (v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0)) + (v2i16 (COPY (S_LSHL_B32 i16:$src1, (i16 16)))) +>; + +def : GCNPat < + (v2i16 (build_vector i16:$src0, (i16 undef))), + (v2i16 (COPY $src0)) +>; + +def : GCNPat < + (v2f16 (build_vector f16:$src0, (f16 undef))), + (v2f16 (COPY $src0)) +>; + +def : GCNPat < + (v2i16 (build_vector (i16 undef), i16:$src1)), + (v2i16 (COPY (S_LSHL_B32 $src1, (i32 16)))) >; +def : GCNPat < + (v2f16 (build_vector (f16 undef), f16:$src1)), + (v2f16 (COPY (S_LSHL_B32 $src1, (i32 16)))) +>; let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < @@ -1501,15 +1544,15 @@ def : GCNPat < } // End SubtargetPredicate = HasVOP3PInsts -// def : GCNPat < -// (v2f16 (scalar_to_vector f16:$src0)), -// (COPY $src0) -// >; +def : GCNPat < + (v2f16 (scalar_to_vector f16:$src0)), + (COPY $src0) +>; -// def : GCNPat < -// (v2i16 (scalar_to_vector i16:$src0)), -// (COPY $src0) -// >; +def : GCNPat < + (v2i16 (scalar_to_vector i16:$src0)), + (COPY $src0) +>; def : GCNPat < (v4i16 (scalar_to_vector i16:$src0)), @@ -1587,18 +1630,19 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>; defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>; -def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>; -def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>; +defm : IntMed3Pat<V_MED3_I32, smin, smax, smin_oneuse, smax_oneuse>; +defm : IntMed3Pat<V_MED3_U32, umin, umax, umin_oneuse, umax_oneuse>; } // This matches 16 permutations of // max(min(x, y), min(max(x, y), z)) class FPMed3Pat<ValueType vt, + //SDPatternOperator max, SDPatternOperator min, Instruction med3Inst> : GCNPat< - (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), - (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan 
vt:$src0, i32:$src0_mods), + (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))), (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) @@ -1606,28 +1650,41 @@ class FPMed3Pat<ValueType vt, class FP16Med3Pat<ValueType vt, Instruction med3Inst> : GCNPat< - (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), - (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), - (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), - (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), + (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), + (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))), (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE) >; -class Int16Med3Pat<Instruction med3Inst, +multiclass Int16Med3Pat<Instruction med3Inst, + SDPatternOperator min, SDPatternOperator max, SDPatternOperator max_oneuse, SDPatternOperator min_oneuse, - ValueType vt = i32> : GCNPat< + ValueType vt = i16> { + // This matches 16 permutations of + // max(min(x, y), min(max(x, y), z)) + def : GCNPat < (max (min_oneuse vt:$src0, vt:$src1), (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)), (med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE) >; + // This matches 16 permutations of + // min(max(a, b), max(min(a, b), c)) + def : GCNPat < + (min (max_oneuse vt:$src0, vt:$src1), + (max_oneuse (min_oneuse vt:$src0, vt:$src1), vt:$src2)), + (med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE) +>; +} + def : FPMed3Pat<f32, V_MED3_F32>; let OtherPredicates = [isGFX9] in { def : FP16Med3Pat<f16, V_MED3_F16>; -def : 
Int16Med3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>; -def : Int16Med3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>; +defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>; +defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>; } // End Predicates = [isGFX9] diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td index 7b7cf1635050b..e51ff4b4bc50e 100644 --- a/lib/Target/AMDGPU/SIIntrinsics.td +++ b/lib/Target/AMDGPU/SIIntrinsics.td @@ -16,36 +16,4 @@ let TargetPrefix = "SI", isTarget = 1 in { def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed - def int_SI_tbuffer_store : Intrinsic < - [], - [llvm_anyint_ty, // rsrc(SGPR) - llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32 - llvm_i32_ty, // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW - llvm_i32_ty, // vaddr(VGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // inst_offset(imm) - llvm_i32_ty, // dfmt(imm) - llvm_i32_ty, // nfmt(imm) - llvm_i32_ty, // offen(imm) - llvm_i32_ty, // idxen(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty], // tfe(imm) - []>; - - // Fully-flexible BUFFER_LOAD_DWORD_* except for the ADDR64 bit, which is not exposed - def int_SI_buffer_load_dword : Intrinsic < - [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32 - [llvm_anyint_ty, // rsrc(SGPR) - llvm_anyint_ty, // vaddr(VGPR) - llvm_i32_ty, // soffset(SGPR) - llvm_i32_ty, // inst_offset(imm) - llvm_i32_ty, // offen(imm) - llvm_i32_ty, // idxen(imm) - llvm_i32_ty, // glc(imm) - llvm_i32_ty, // slc(imm) - llvm_i32_ty], // tfe(imm) - [IntrReadMem, IntrArgMemOnly]>; - } // End TargetPrefix = "SI", isTarget = 1 diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 4b537540046fe..be291b127301d 
100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -20,6 +20,26 @@ // ==> // s_buffer_load_dwordx2 s[4:5], s[0:3], 4 // +// This pass also tries to promote constant offset to the immediate by +// adjusting the base. It tries to use a base from the nearby instructions that +// allows it to have a 13bit constant offset and then promotes the 13bit offset +// to the immediate. +// E.g. +// s_movk_i32 s0, 0x1800 +// v_add_co_u32_e32 v0, vcc, s0, v2 +// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +// +// s_movk_i32 s0, 0x1000 +// v_add_co_u32_e32 v5, vcc, s0, v2 +// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +// global_load_dwordx2 v[5:6], v[5:6], off +// global_load_dwordx2 v[0:1], v[0:1], off +// => +// s_movk_i32 s0, 0x1000 +// v_add_co_u32_e32 v5, vcc, s0, v2 +// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +// global_load_dwordx2 v[5:6], v[5:6], off +// global_load_dwordx2 v[0:1], v[5:6], off offset:2048 // // Future improvements: // @@ -43,9 +63,9 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" @@ -74,23 +94,38 @@ using namespace llvm; #define DEBUG_TYPE "si-load-store-opt" namespace { +enum InstClassEnum { + UNKNOWN, + DS_READ, + DS_WRITE, + S_BUFFER_LOAD_IMM, + BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN, + BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN, + BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET, + BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact, + BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact, + BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact, + BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact, +}; -class 
SILoadStoreOptimizer : public MachineFunctionPass { - enum InstClassEnum { - DS_READ_WRITE, - S_BUFFER_LOAD_IMM, - BUFFER_LOAD_OFFEN, - BUFFER_LOAD_OFFSET, - BUFFER_STORE_OFFEN, - BUFFER_STORE_OFFSET, - }; +enum RegisterEnum { + SBASE = 0x1, + SRSRC = 0x2, + SOFFSET = 0x4, + VADDR = 0x8, + ADDR = 0x10, +}; +class SILoadStoreOptimizer : public MachineFunctionPass { struct CombineInfo { MachineBasicBlock::iterator I; MachineBasicBlock::iterator Paired; unsigned EltSize; unsigned Offset0; unsigned Offset1; + unsigned Width0; + unsigned Width1; unsigned BaseOff; InstClassEnum InstClass; bool GLC0; @@ -98,9 +133,23 @@ class SILoadStoreOptimizer : public MachineFunctionPass { bool SLC0; bool SLC1; bool UseST64; - bool IsX2; - SmallVector<MachineInstr*, 8> InstsToMove; - }; + SmallVector<MachineInstr *, 8> InstsToMove; + }; + + struct BaseRegisters { + unsigned LoReg = 0; + unsigned HiReg = 0; + + unsigned LoSubReg = 0; + unsigned HiSubReg = 0; + }; + + struct MemAddress { + BaseRegisters Base; + int64_t Offset = 0; + }; + + using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; private: const GCNSubtarget *STM = nullptr; @@ -108,9 +157,16 @@ private: const SIRegisterInfo *TRI = nullptr; MachineRegisterInfo *MRI = nullptr; AliasAnalysis *AA = nullptr; - unsigned CreatedX2; + bool OptimizeAgain; static bool offsetsCanBeCombined(CombineInfo &CI); + static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI); + static unsigned getNewOpcode(const CombineInfo &CI); + static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI); + const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI); + unsigned getOpcodeWidth(const MachineInstr &MI); + InstClassEnum getInstClass(unsigned Opc); + unsigned getRegs(unsigned Opc); bool findMatchingInst(CombineInfo &CI); @@ -123,10 +179,21 @@ private: MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); 
MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); - unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2, - bool &IsOffen) const; MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); + void updateBaseAndOffset(MachineInstr &I, unsigned NewBase, + int32_t NewOffset); + unsigned computeBase(MachineInstr &MI, const MemAddress &Addr); + MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI); + Optional<int32_t> extractConstOffset(const MachineOperand &Op); + void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr); + /// Promotes constant offset to the immediate by adjusting the base. It + /// tries to use a base from the nearby instructions that allows it to have + /// a 13bit constant offset which gets promoted to the immediate. + bool promoteConstantOffsetToImm(MachineInstr &CI, + MemInfoMap &Visited, + SmallPtrSet<MachineInstr *, 4> &Promoted); + public: static char ID; @@ -153,8 +220,8 @@ public: INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, - "SI Load Store Optimizer", false, false) +INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", + false, false) char SILoadStoreOptimizer::ID = 0; @@ -165,7 +232,7 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass() { } static void moveInstsAfter(MachineBasicBlock::iterator I, - ArrayRef<MachineInstr*> InstsToMove) { + ArrayRef<MachineInstr *> InstsToMove) { MachineBasicBlock *MBB = I->getParent(); ++I; for (MachineInstr *MI : InstsToMove) { @@ -191,21 +258,19 @@ static void addDefsUsesToList(const MachineInstr &MI, static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, MachineBasicBlock::iterator B, const SIInstrInfo *TII, - AliasAnalysis * AA) { + AliasAnalysis *AA) { // RAW or WAR - cannot reorder // WAW - cannot reorder // RAR - safe to reorder return 
!(A->mayStore() || B->mayStore()) || - TII->areMemAccessesTriviallyDisjoint(*A, *B, AA); + TII->areMemAccessesTriviallyDisjoint(*A, *B, AA); } // Add MI and its defs to the lists if MI reads one of the defs that are // already in the list. Returns true in that case. -static bool -addToListsIfDependent(MachineInstr &MI, - DenseSet<unsigned> &RegDefs, - DenseSet<unsigned> &PhysRegUses, - SmallVectorImpl<MachineInstr*> &Insts) { +static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs, + DenseSet<unsigned> &PhysRegUses, + SmallVectorImpl<MachineInstr *> &Insts) { for (MachineOperand &Use : MI.operands()) { // If one of the defs is read, then there is a use of Def between I and the // instruction that I will potentially be merged with. We will need to move @@ -228,18 +293,16 @@ addToListsIfDependent(MachineInstr &MI, return false; } -static bool -canMoveInstsAcrossMemOp(MachineInstr &MemOp, - ArrayRef<MachineInstr*> InstsToMove, - const SIInstrInfo *TII, - AliasAnalysis *AA) { +static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, + ArrayRef<MachineInstr *> InstsToMove, + const SIInstrInfo *TII, AliasAnalysis *AA) { assert(MemOp.mayLoadOrStore()); for (MachineInstr *InstToMove : InstsToMove) { if (!InstToMove->mayLoadOrStore()) continue; if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA)) - return false; + return false; } return true; } @@ -260,10 +323,9 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { CI.BaseOff = 0; // Handle SMEM and VMEM instructions. - if (CI.InstClass != DS_READ_WRITE) { - unsigned Diff = CI.IsX2 ? 
2 : 1; - return (EltOffset0 + Diff == EltOffset1 || - EltOffset1 + Diff == EltOffset0) && + if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { + return (EltOffset0 + CI.Width0 == EltOffset1 || + EltOffset1 + CI.Width1 == EltOffset0) && CI.GLC0 == CI.GLC1 && (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1); } @@ -305,42 +367,176 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { return false; } +bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, + const CombineInfo &CI) { + const unsigned Width = (CI.Width0 + CI.Width1); + switch (CI.InstClass) { + default: + return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); + case S_BUFFER_LOAD_IMM: + switch (Width) { + default: + return false; + case 2: + case 4: + return true; + } + } +} + +unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) { + const unsigned Opc = MI.getOpcode(); + + if (TII->isMUBUF(MI)) { + return AMDGPU::getMUBUFDwords(Opc); + } + + switch (Opc) { + default: + return 0; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + return 1; + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + return 2; + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return 4; + } +} + +InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) { + if (TII->isMUBUF(Opc)) { + const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc); + + // If we couldn't identify the opcode, bail out. 
+ if (baseOpcode == -1) { + return UNKNOWN; + } + + switch (baseOpcode) { + default: + return UNKNOWN; + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: + return BUFFER_LOAD_OFFEN; + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: + return BUFFER_LOAD_OFFSET; + case AMDGPU::BUFFER_STORE_DWORD_OFFEN: + return BUFFER_STORE_OFFEN; + case AMDGPU::BUFFER_STORE_DWORD_OFFSET: + return BUFFER_STORE_OFFSET; + case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: + return BUFFER_LOAD_OFFEN_exact; + case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: + return BUFFER_LOAD_OFFSET_exact; + case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: + return BUFFER_STORE_OFFEN_exact; + case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: + return BUFFER_STORE_OFFSET_exact; + } + } + + switch (Opc) { + default: + return UNKNOWN; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return S_BUFFER_LOAD_IMM; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64_gfx9: + return DS_READ; + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64_gfx9: + return DS_WRITE; + } +} + +unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) { + if (TII->isMUBUF(Opc)) { + unsigned result = 0; + + if (AMDGPU::getMUBUFHasVAddr(Opc)) { + result |= VADDR; + } + + if (AMDGPU::getMUBUFHasSrsrc(Opc)) { + result |= SRSRC; + } + + if (AMDGPU::getMUBUFHasSoffset(Opc)) { + result |= SOFFSET; + } + + return result; + } + + switch (Opc) { + default: + return 0; + case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + return SBASE; + case AMDGPU::DS_READ_B32: + case AMDGPU::DS_READ_B64: + case AMDGPU::DS_READ_B32_gfx9: + case AMDGPU::DS_READ_B64_gfx9: + case AMDGPU::DS_WRITE_B32: + case AMDGPU::DS_WRITE_B64: + case AMDGPU::DS_WRITE_B32_gfx9: + case AMDGPU::DS_WRITE_B64_gfx9: + return ADDR; + } +} + bool 
SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator MBBI = CI.I; - unsigned AddrOpName[3] = {0}; - int AddrIdx[3]; - const MachineOperand *AddrReg[3]; + const unsigned Opc = CI.I->getOpcode(); + const InstClassEnum InstClass = getInstClass(Opc); + + if (InstClass == UNKNOWN) { + return false; + } + + const unsigned Regs = getRegs(Opc); + + unsigned AddrOpName[5] = {0}; + int AddrIdx[5]; + const MachineOperand *AddrReg[5]; unsigned NumAddresses = 0; - switch (CI.InstClass) { - case DS_READ_WRITE: + if (Regs & ADDR) { AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; - break; - case S_BUFFER_LOAD_IMM: + } + + if (Regs & SBASE) { AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; - break; - case BUFFER_LOAD_OFFEN: - case BUFFER_STORE_OFFEN: - AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; - AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; - AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; - break; - case BUFFER_LOAD_OFFSET: - case BUFFER_STORE_OFFSET: + } + + if (Regs & SRSRC) { AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; + } + + if (Regs & SOFFSET) { AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; - break; + } + + if (Regs & VADDR) { + AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; } for (unsigned i = 0; i < NumAddresses; i++) { AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]); AddrReg[i] = &CI.I->getOperand(AddrIdx[i]); - // We only ever merge operations with the same base address register, so don't - // bother scanning forward if there are no other uses. + // We only ever merge operations with the same base address register, so + // don't bother scanning forward if there are no other uses. 
if (AddrReg[i]->isReg() && (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) || MRI->hasOneNonDBGUse(AddrReg[i]->getReg()))) @@ -353,8 +549,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { DenseSet<unsigned> PhysRegUsesToMove; addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); - for ( ; MBBI != E; ++MBBI) { - if (MBBI->getOpcode() != CI.I->getOpcode()) { + for (; MBBI != E; ++MBBI) { + const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE); + + if ((getInstClass(MBBI->getOpcode()) != InstClass) || + (IsDS && (MBBI->getOpcode() != Opc))) { // This is not a matching DS instruction, but we can keep looking as // long as one of these conditions are met: // 1. It is safe to move I down past MBBI. @@ -368,8 +567,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { } if (MBBI->mayLoadOrStore() && - (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || - !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { + (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { // We fail condition #1, but we may still be able to satisfy condition // #2. Add this instruction to the move list and then we will check // if condition #2 holds once we have selected the matching instruction. @@ -413,8 +612,8 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { continue; } - // Check same base pointer. Be careful of subregisters, which can occur with - // vectors of pointers. + // Check same base pointer. Be careful of subregisters, which can occur + // with vectors of pointers. 
if (AddrReg[i]->getReg() != AddrRegNext.getReg() || AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { Match = false; @@ -423,13 +622,15 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { } if (Match) { - int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), - AMDGPU::OpName::offset); + int OffsetIdx = + AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset); CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm(); + CI.Width0 = getOpcodeWidth(*CI.I); CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm(); + CI.Width1 = getOpcodeWidth(*MBBI); CI.Paired = MBBI; - if (CI.InstClass == DS_READ_WRITE) { + if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) { CI.Offset0 &= 0xffff; CI.Offset1 &= 0xffff; } else { @@ -445,7 +646,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { // We also need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged // instruction. - if (offsetsCanBeCombined(CI)) + if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI)) if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA)) return true; } @@ -472,12 +673,12 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { if (STM->ldsRequiresM0Init()) return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; - return (EltSize == 4) ? - AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9; + return (EltSize == 4) ? 
AMDGPU::DS_READ2ST64_B32_gfx9 + : AMDGPU::DS_READ2ST64_B64_gfx9; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); // Be careful, since the addresses could be subregisters themselves in weird @@ -489,8 +690,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( unsigned NewOffset0 = CI.Offset0; unsigned NewOffset1 = CI.Offset1; - unsigned Opc = CI.UseST64 ? - read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); + unsigned Opc = + CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; @@ -502,39 +703,40 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( } assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && - (NewOffset0 != NewOffset1) && - "Computed offset doesn't fit"); + (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Read2Desc = TII->get(Opc); - const TargetRegisterClass *SuperRC - = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; + const TargetRegisterClass *SuperRC = + (CI.EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; unsigned DestReg = MRI->createVirtualRegister(SuperRC); DebugLoc DL = CI.I->getDebugLoc(); unsigned BaseReg = AddrReg->getReg(); + unsigned BaseSubReg = AddrReg->getSubReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) - .addImm(CI.BaseOff); + .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) - .addReg(ImmReg) - .addReg(AddrReg->getReg()); + .addReg(ImmReg) + .addReg(AddrReg->getReg(), 0, BaseSubReg); + BaseSubReg = 0; } MachineInstrBuilder Read2 = - BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) - .addReg(BaseReg, BaseRegFlags) // addr - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg) + .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); (void)Read2; @@ -561,32 +763,36 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { if (STM->ldsRequiresM0Init()) return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; - return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9; + return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 + : AMDGPU::DS_WRITE2_B64_gfx9; } unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { if (STM->ldsRequiresM0Init()) - return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; + return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 + : AMDGPU::DS_WRITE2ST64_B64; - return (EltSize == 4) ? 
- AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9; + return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 + : AMDGPU::DS_WRITE2ST64_B64_gfx9; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); // Be sure to use .addOperand(), and not .addReg() with these. We want to be // sure we preserve the subregister index and any register flags set on them. - const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); - const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); - const MachineOperand *Data1 - = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); + const MachineOperand *AddrReg = + TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); + const MachineOperand *Data0 = + TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); + const MachineOperand *Data1 = + TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0); unsigned NewOffset0 = CI.Offset0; unsigned NewOffset1 = CI.Offset1; - unsigned Opc = CI.UseST64 ? - write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); + unsigned Opc = + CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); if (NewOffset0 > NewOffset1) { // Canonicalize the merged instruction so the smaller offset comes first. 
@@ -595,36 +801,37 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( } assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && - (NewOffset0 != NewOffset1) && - "Computed offset doesn't fit"); + (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); const MCInstrDesc &Write2Desc = TII->get(Opc); DebugLoc DL = CI.I->getDebugLoc(); unsigned BaseReg = AddrReg->getReg(); + unsigned BaseSubReg = AddrReg->getSubReg(); unsigned BaseRegFlags = 0; if (CI.BaseOff) { unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) - .addImm(CI.BaseOff); + .addImm(CI.BaseOff); BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg) - .addReg(ImmReg) - .addReg(AddrReg->getReg()); + .addReg(ImmReg) + .addReg(AddrReg->getReg(), 0, BaseSubReg); + BaseSubReg = 0; } MachineInstrBuilder Write2 = - BuildMI(*MBB, CI.Paired, DL, Write2Desc) - .addReg(BaseReg, BaseRegFlags) // addr - .add(*Data0) // data0 - .add(*Data1) // data1 - .addImm(NewOffset0) // offset0 - .addImm(NewOffset1) // offset1 - .addImm(0) // gds - .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + BuildMI(*MBB, CI.Paired, DL, Write2Desc) + .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr + .add(*Data0) // data0 + .add(*Data1) // data1 + .addImm(NewOffset0) // offset0 + .addImm(NewOffset1) // offset1 + .addImm(0) // gds + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); moveInstsAfter(Write2, CI.InstsToMove); @@ -636,15 +843,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( return Next; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - unsigned Opcode = CI.IsX2 ? 
AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM : - AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + const unsigned Opcode = getNewOpcode(CI); + + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); - const TargetRegisterClass *SuperRC = - CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass; unsigned DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); @@ -652,14 +858,11 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) .addImm(MergedOffset) // offset .addImm(CI.GLC0) // glc - .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); - unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; - - // Handle descending offsets - if (CI.Offset0 > CI.Offset1) - std::swap(SubRegIdx0, SubRegIdx1); + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); // Copy to the old destination registers. const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); @@ -681,29 +884,25 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( return Next; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( - CombineInfo &CI) { +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - unsigned Opcode; - if (CI.InstClass == BUFFER_LOAD_OFFEN) { - Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN : - AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; - } else { - Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET : - AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; - } + const unsigned Opcode = getNewOpcode(CI); - const TargetRegisterClass *SuperRC = - CI.IsX2 ? 
&AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); + + // Copy to the new source register. unsigned DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); - if (CI.InstClass == BUFFER_LOAD_OFFEN) - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + const unsigned Regs = getRegs(Opcode); + + if (Regs & VADDR) + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) @@ -711,14 +910,11 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( .addImm(CI.GLC0) // glc .addImm(CI.SLC0) // slc .addImm(0) // tfe - .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); - unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; - - // Handle descending offsets - if (CI.Offset0 > CI.Offset1) - std::swap(SubRegIdx0, SubRegIdx1); + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); // Copy to the old destination registers. 
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); @@ -740,57 +936,137 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( return Next; } -unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode( - const MachineInstr &I, bool &IsX2, bool &IsOffen) const { - IsX2 = false; - IsOffen = false; +unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { + const unsigned Width = CI.Width0 + CI.Width1; - switch (I.getOpcode()) { - case AMDGPU::BUFFER_STORE_DWORD_OFFEN: - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; - case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: - IsX2 = true; - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact: - IsX2 = true; - IsOffen = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET: - return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; - case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: - return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: - IsX2 = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; - case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact: - IsX2 = true; - return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact; + switch (CI.InstClass) { + default: + return AMDGPU::getMUBUFOpcode(CI.InstClass, Width); + case UNKNOWN: + llvm_unreachable("Unknown instruction class"); + case S_BUFFER_LOAD_IMM: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + case 4: + return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; + } } - return 0; } -MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( - CombineInfo &CI) { +std::pair<unsigned, unsigned> +SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) { + if (CI.Offset0 > CI.Offset1) { + switch (CI.Width0) { + default: + return std::make_pair(0, 0); + case 1: + 
switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub1, AMDGPU::sub0); + case 2: + return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1); + case 3: + return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2); + } + case 2: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0); + case 2: + return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1); + } + case 3: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0); + } + } + } else { + switch (CI.Width0) { + default: + return std::make_pair(0, 0); + case 1: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub0, AMDGPU::sub1); + case 2: + return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2); + case 3: + return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3); + } + case 2: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2); + case 2: + return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3); + } + case 3: + switch (CI.Width1) { + default: + return std::make_pair(0, 0); + case 1: + return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3); + } + } + } +} + +const TargetRegisterClass * +SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) { + if (CI.InstClass == S_BUFFER_LOAD_IMM) { + switch (CI.Width0 + CI.Width1) { + default: + return nullptr; + case 2: + return &AMDGPU::SReg_64_XEXECRegClass; + case 4: + return &AMDGPU::SReg_128RegClass; + case 8: + return &AMDGPU::SReg_256RegClass; + case 16: + return &AMDGPU::SReg_512RegClass; + } + } else { + switch (CI.Width0 + CI.Width1) { + default: + return nullptr; + case 2: + return &AMDGPU::VReg_64RegClass; + case 3: + return &AMDGPU::VReg_96RegClass; + case 4: + return 
&AMDGPU::VReg_128RegClass; + } + } +} + +MachineBasicBlock::iterator +SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - bool Unused1, Unused2; - unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2); - unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; - unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; + const unsigned Opcode = getNewOpcode(CI); - // Handle descending offsets - if (CI.Offset0 > CI.Offset1) - std::swap(SubRegIdx0, SubRegIdx1); + std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI); + const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); + const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); // Copy to the new source register. - const TargetRegisterClass *SuperRC = - CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI); unsigned SrcReg = MRI->createVirtualRegister(SuperRC); const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); @@ -803,18 +1079,20 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( .addImm(SubRegIdx1); auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode)) - .addReg(SrcReg, RegState::Kill); + .addReg(SrcReg, RegState::Kill); - if (CI.InstClass == BUFFER_STORE_OFFEN) - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + const unsigned Regs = getRegs(Opcode); + + if (Regs & VADDR) + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) .addImm(std::min(CI.Offset0, CI.Offset1)) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe - .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); 
moveInstsAfter(MIB, CI.InstsToMove); @@ -824,105 +1102,399 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( return Next; } +MachineOperand +SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) { + APInt V(32, Val, true); + if (TII->isInlineConstant(V)) + return MachineOperand::CreateImm(Val); + + unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); + MachineInstr *Mov = + BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), + TII->get(AMDGPU::S_MOV_B32), Reg) + .addImm(Val); + (void)Mov; + LLVM_DEBUG(dbgs() << " "; Mov->dump()); + return MachineOperand::CreateReg(Reg, false); +} + +// Compute base address using Addr and return the final register. +unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, + const MemAddress &Addr) { + MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::iterator MBBI = MI.getIterator(); + DebugLoc DL = MI.getDebugLoc(); + + assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || + Addr.Base.LoSubReg) && + "Expected 32-bit Base-Register-Low!!"); + + assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || + Addr.Base.HiSubReg) && + "Expected 32-bit Base-Register-Hi!!"); + + LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); + MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); + MachineOperand OffsetHi = + createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); + unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned DeadCarryReg = + MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + + unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MachineInstr *LoHalf = + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0) + .addReg(CarryReg, RegState::Define) + .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) + .add(OffsetLo); + (void)LoHalf; + LLVM_DEBUG(dbgs() 
<< " "; LoHalf->dump();); + + MachineInstr *HiHalf = + BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) + .addReg(DeadCarryReg, RegState::Define | RegState::Dead) + .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) + .add(OffsetHi) + .addReg(CarryReg, RegState::Kill); + (void)HiHalf; + LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); + + unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); + MachineInstr *FullBase = + BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + (void)FullBase; + LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); + + return FullDestReg; +} + +// Update base and offset with the NewBase and NewOffset in MI. +void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI, + unsigned NewBase, + int32_t NewOffset) { + TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase); + TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset); +} + +Optional<int32_t> +SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) { + if (Op.isImm()) + return Op.getImm(); + + if (!Op.isReg()) + return None; + + MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg()); + if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 || + !Def->getOperand(1).isImm()) + return None; + + return Def->getOperand(1).getImm(); +} + +// Analyze Base and extracts: +// - 32bit base registers, subregisters +// - 64bit constant offset +// Expecting base computation as: +// %OFFSET0:sgpr_32 = S_MOV_B32 8000 +// %LO:vgpr_32, %c:sreg_64_xexec = +// V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32, +// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec +// %Base:vreg_64 = +// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1 +void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base, + MemAddress &Addr) { + if (!Base.isReg()) + return; + + 
MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg()); + if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE + || Def->getNumOperands() != 5) + return; + + MachineOperand BaseLo = Def->getOperand(1); + MachineOperand BaseHi = Def->getOperand(3); + if (!BaseLo.isReg() || !BaseHi.isReg()) + return; + + MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg()); + MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg()); + + if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 || + !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64) + return; + + const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0); + const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1); + + auto Offset0P = extractConstOffset(*Src0); + if (Offset0P) + BaseLo = *Src1; + else { + if (!(Offset0P = extractConstOffset(*Src1))) + return; + BaseLo = *Src0; + } + + Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0); + Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1); + + if (Src0->isImm()) + std::swap(Src0, Src1); + + if (!Src1->isImm()) + return; + + uint64_t Offset1 = Src1->getImm(); + BaseHi = *Src0; + + Addr.Base.LoReg = BaseLo.getReg(); + Addr.Base.HiReg = BaseHi.getReg(); + Addr.Base.LoSubReg = BaseLo.getSubReg(); + Addr.Base.HiSubReg = BaseHi.getSubReg(); + Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32); +} + +bool SILoadStoreOptimizer::promoteConstantOffsetToImm( + MachineInstr &MI, + MemInfoMap &Visited, + SmallPtrSet<MachineInstr *, 4> &AnchorList) { + + // TODO: Support flat and scratch. + if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 || + TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL) + return false; + + // TODO: Support Store. 
+ if (!MI.mayLoad()) + return false; + + if (AnchorList.count(&MI)) + return false; + + LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump()); + + if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) { + LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";); + return false; + } + + // Step1: Find the base-registers and a 64bit constant offset. + MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); + MemAddress MAddr; + if (Visited.find(&MI) == Visited.end()) { + processBaseWithConstOffset(Base, MAddr); + Visited[&MI] = MAddr; + } else + MAddr = Visited[&MI]; + + if (MAddr.Offset == 0) { + LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no" + " constant offsets that can be promoted.\n";); + return false; + } + + LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", " + << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";); + + // Step2: Traverse through MI's basic block and find an anchor(that has the + // same base-registers) with the highest 13bit distance from MI's offset. + // E.g. (64bit loads) + // bb: + // addr1 = &a + 4096; load1 = load(addr1, 0) + // addr2 = &a + 6144; load2 = load(addr2, 0) + // addr3 = &a + 8192; load3 = load(addr3, 0) + // addr4 = &a + 10240; load4 = load(addr4, 0) + // addr5 = &a + 12288; load5 = load(addr5, 0) + // + // Starting from the first load, the optimization will try to find a new base + // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192 + // has 13bit distance from &a + 4096. The heuristic considers &a + 8192 + // as the new-base(anchor) because of the maximum distance which can + // accomodate more intermediate bases presumeably. + // + // Step3: move (&a + 8192) above load1. Compute and promote offsets from + // (&a + 8192) for load1, load2, load4. 
+ // addr = &a + 8192 + // load1 = load(addr, -4096) + // load2 = load(addr, -2048) + // load3 = load(addr, 0) + // load4 = load(addr, 2048) + // addr5 = &a + 12288; load5 = load(addr5, 0) + // + MachineInstr *AnchorInst = nullptr; + MemAddress AnchorAddr; + uint32_t MaxDist = std::numeric_limits<uint32_t>::min(); + SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase; + + MachineBasicBlock *MBB = MI.getParent(); + MachineBasicBlock::iterator E = MBB->end(); + MachineBasicBlock::iterator MBBI = MI.getIterator(); + ++MBBI; + const SITargetLowering *TLI = + static_cast<const SITargetLowering *>(STM->getTargetLowering()); + + for ( ; MBBI != E; ++MBBI) { + MachineInstr &MINext = *MBBI; + // TODO: Support finding an anchor(with same base) from store addresses or + // any other load addresses where the opcodes are different. + if (MINext.getOpcode() != MI.getOpcode() || + TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm()) + continue; + + const MachineOperand &BaseNext = + *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr); + MemAddress MAddrNext; + if (Visited.find(&MINext) == Visited.end()) { + processBaseWithConstOffset(BaseNext, MAddrNext); + Visited[&MINext] = MAddrNext; + } else + MAddrNext = Visited[&MINext]; + + if (MAddrNext.Base.LoReg != MAddr.Base.LoReg || + MAddrNext.Base.HiReg != MAddr.Base.HiReg || + MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg || + MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg) + continue; + + InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset)); + + int64_t Dist = MAddr.Offset - MAddrNext.Offset; + TargetLoweringBase::AddrMode AM; + AM.HasBaseReg = true; + AM.BaseOffs = Dist; + if (TLI->isLegalGlobalAddressingMode(AM) && + (uint32_t)std::abs(Dist) > MaxDist) { + MaxDist = std::abs(Dist); + + AnchorAddr = MAddrNext; + AnchorInst = &MINext; + } + } + + if (AnchorInst) { + LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): "; + AnchorInst->dump()); + LLVM_DEBUG(dbgs() << " 
Anchor-Offset from BASE: " + << AnchorAddr.Offset << "\n\n"); + + // Instead of moving up, just re-compute anchor-instruction's base address. + unsigned Base = computeBase(MI, AnchorAddr); + + updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset); + LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump();); + + for (auto P : InstsWCommonBase) { + TargetLoweringBase::AddrMode AM; + AM.HasBaseReg = true; + AM.BaseOffs = P.second - AnchorAddr.Offset; + + if (TLI->isLegalGlobalAddressingMode(AM)) { + LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second; + dbgs() << ")"; P.first->dump()); + updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset); + LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump()); + } + } + AnchorList.insert(AnchorInst); + return true; + } + + return false; +} + // Scan through looking for adjacent LDS operations with constant offsets from // the same base register. We rely on the scheduler to do the hard work of // clustering nearby loads, and assume these are all adjacent. bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { bool Modified = false; + // Contain the list + MemInfoMap Visited; + // Contains the list of instructions for which constant offsets are being + // promoted to the IMM. + SmallPtrSet<MachineInstr *, 4> AnchorList; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { MachineInstr &MI = *I; + if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) + Modified = true; + // Don't combine if volatile. 
if (MI.hasOrderedMemoryRef()) { ++I; continue; } + const unsigned Opc = MI.getOpcode(); + CombineInfo CI; CI.I = I; - unsigned Opc = MI.getOpcode(); - if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 || - Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) { + CI.InstClass = getInstClass(Opc); - CI.InstClass = DS_READ_WRITE; + switch (CI.InstClass) { + default: + break; + case DS_READ: CI.EltSize = - (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4; - + (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 + : 4; if (findMatchingInst(CI)) { Modified = true; I = mergeRead2Pair(CI); } else { ++I; } - continue; - } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 || - Opc == AMDGPU::DS_WRITE_B32_gfx9 || - Opc == AMDGPU::DS_WRITE_B64_gfx9) { - CI.InstClass = DS_READ_WRITE; - CI.EltSize - = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4; - + case DS_WRITE: + CI.EltSize = + (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 + : 4; if (findMatchingInst(CI)) { Modified = true; I = mergeWrite2Pair(CI); } else { ++I; } - continue; - } - if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM || - Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) { - // EltSize is in units of the offset encoding. 
- CI.InstClass = S_BUFFER_LOAD_IMM; + case S_BUFFER_LOAD_IMM: CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4); - CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; if (findMatchingInst(CI)) { Modified = true; I = mergeSBufferLoadImmPair(CI); - if (!CI.IsX2) - CreatedX2++; + OptimizeAgain |= (CI.Width0 + CI.Width1) < 16; } else { ++I; } continue; - } - if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) { - if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN) - CI.InstClass = BUFFER_LOAD_OFFEN; - else - CI.InstClass = BUFFER_LOAD_OFFSET; - + case BUFFER_LOAD_OFFEN: + case BUFFER_LOAD_OFFSET: + case BUFFER_LOAD_OFFEN_exact: + case BUFFER_LOAD_OFFSET_exact: CI.EltSize = 4; - CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || - Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; if (findMatchingInst(CI)) { Modified = true; I = mergeBufferLoadPair(CI); - if (!CI.IsX2) - CreatedX2++; + OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; } else { ++I; } continue; - } - - bool StoreIsX2, IsOffen; - if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) { - CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET; + case BUFFER_STORE_OFFEN: + case BUFFER_STORE_OFFSET: + case BUFFER_STORE_OFFEN_exact: + case BUFFER_STORE_OFFSET_exact: CI.EltSize = 4; - CI.IsX2 = StoreIsX2; if (findMatchingInst(CI)) { Modified = true; I = mergeBufferStorePair(CI); - if (!CI.IsX2) - CreatedX2++; + OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; } else { ++I; } @@ -956,12 +1528,10 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { bool Modified = false; for (MachineBasicBlock &MBB : MF) { - CreatedX2 = 0; - Modified |= optimizeBlock(MBB); - - // Run again to convert x2 to x4. 
- if (CreatedX2 >= 1) + do { + OptimizeAgain = false; Modified |= optimizeBlock(MBB); + } while (OptimizeAgain); } return Modified; diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index ad30317c344c3..1aa1feebbdae6 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -85,9 +85,7 @@ private: void emitIf(MachineInstr &MI); void emitElse(MachineInstr &MI); - void emitBreak(MachineInstr &MI); void emitIfBreak(MachineInstr &MI); - void emitElseBreak(MachineInstr &MI); void emitLoop(MachineInstr &MI); void emitEndCf(MachineInstr &MI); @@ -329,20 +327,6 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); } -void SILowerControlFlow::emitBreak(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - unsigned Dst = MI.getOperand(0).getReg(); - - MachineInstr *Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(AMDGPU::EXEC) - .add(MI.getOperand(1)); - - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *Or); - MI.eraseFromParent(); -} - void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -384,11 +368,6 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { MI.eraseFromParent(); } -void SILowerControlFlow::emitElseBreak(MachineInstr &MI) { - // Lowered in the same way as emitIfBreak above. 
- emitIfBreak(MI); -} - void SILowerControlFlow::emitLoop(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -515,18 +494,10 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { emitElse(MI); break; - case AMDGPU::SI_BREAK: - emitBreak(MI); - break; - case AMDGPU::SI_IF_BREAK: emitIfBreak(MI); break; - case AMDGPU::SI_ELSE_BREAK: - emitElseBreak(MI); - break; - case AMDGPU::SI_LOOP: emitLoop(MI); break; diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp index ecc6cff407e18..eb038bb5d5fcf 100644 --- a/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -5,37 +5,61 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -/// i1 values are usually inserted by the CFG Structurize pass and they are -/// unique in that they can be copied from VALU to SALU registers. -/// This is not possible for any other value type. Since there are no -/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1. -/// //===----------------------------------------------------------------------===// // +// This pass lowers all occurrences of i1 values (with a vreg_1 register class) +// to lane masks (64-bit scalar registers). The pass assumes machine SSA form +// and a wave-level control flow graph. +// +// Before this pass, values that are semantically i1 and are defined and used +// within the same basic block are already represented as lane masks in scalar +// registers. However, values that cross basic blocks are always transferred +// between basic blocks in vreg_1 virtual registers and are lowered by this +// pass. +// +// The only instructions that use or define vreg_1 virtual registers are COPY, +// PHI, and IMPLICIT_DEF. 
+// +//===----------------------------------------------------------------------===// -#define DEBUG_TYPE "si-i1-copies" #include "AMDGPU.h" #include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "Utils/AMDGPULaneDominator.h" -#include "llvm/CodeGen/LiveIntervals.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSSAUpdater.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetMachine.h" +#define DEBUG_TYPE "si-i1-copies" + using namespace llvm; +static unsigned createLaneMaskReg(MachineFunction &MF); +static unsigned insertUndefLaneMask(MachineBasicBlock &MBB); + namespace { class SILowerI1Copies : public MachineFunctionPass { public: static char ID; +private: + MachineFunction *MF = nullptr; + MachineDominatorTree *DT = nullptr; + MachinePostDominatorTree *PDT = nullptr; + MachineRegisterInfo *MRI = nullptr; + const GCNSubtarget *ST = nullptr; + const SIInstrInfo *TII = nullptr; + + DenseSet<unsigned> ConstrainRegs; + public: SILowerI1Copies() : MachineFunctionPass(ID) { initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry()); @@ -47,14 +71,337 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); MachineFunctionPass::getAnalysisUsage(AU); } + +private: + void lowerCopiesFromI1(); + void lowerPhis(); + void lowerCopiesToI1(); + bool isConstantLaneMask(unsigned Reg, bool &Val) const; + void buildMergeLaneMasks(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned DstReg, unsigned PrevReg, unsigned CurReg); + MachineBasicBlock::iterator + 
getSaluInsertionAtEnd(MachineBasicBlock &MBB) const; + + bool isLaneMaskReg(unsigned Reg) const { + return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) && + TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) == + ST->getWavefrontSize(); + } +}; + +/// Helper class that determines the relationship between incoming values of a +/// phi in the control flow graph to determine where an incoming value can +/// simply be taken as a scalar lane mask as-is, and where it needs to be +/// merged with another, previously defined lane mask. +/// +/// The approach is as follows: +/// - Determine all basic blocks which, starting from the incoming blocks, +/// a wave may reach before entering the def block (the block containing the +/// phi). +/// - If an incoming block has no predecessors in this set, we can take the +/// incoming value as a scalar lane mask as-is. +/// -- A special case of this is when the def block has a self-loop. +/// - Otherwise, the incoming value needs to be merged with a previously +/// defined lane mask. +/// - If there is a path into the set of reachable blocks that does _not_ go +/// through an incoming block where we can take the scalar lane mask as-is, +/// we need to invent an available value for the SSAUpdater. Choices are +/// 0 and undef, with differing consequences for how to merge values etc. +/// +/// TODO: We could use region analysis to quickly skip over SESE regions during +/// the traversal. +/// +class PhiIncomingAnalysis { + MachinePostDominatorTree &PDT; + + // For each reachable basic block, whether it is a source in the induced + // subgraph of the CFG. + DenseMap<MachineBasicBlock *, bool> ReachableMap; + SmallVector<MachineBasicBlock *, 4> ReachableOrdered; + SmallVector<MachineBasicBlock *, 4> Stack; + SmallVector<MachineBasicBlock *, 4> Predecessors; + +public: + PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {} + + /// Returns whether \p MBB is a source in the induced subgraph of reachable + /// blocks. 
+ bool isSource(MachineBasicBlock &MBB) const { + return ReachableMap.find(&MBB)->second; + } + + ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; } + + void analyze(MachineBasicBlock &DefBlock, + ArrayRef<MachineBasicBlock *> IncomingBlocks) { + assert(Stack.empty()); + ReachableMap.clear(); + ReachableOrdered.clear(); + Predecessors.clear(); + + // Insert the def block first, so that it acts as an end point for the + // traversal. + ReachableMap.try_emplace(&DefBlock, false); + ReachableOrdered.push_back(&DefBlock); + + for (MachineBasicBlock *MBB : IncomingBlocks) { + if (MBB == &DefBlock) { + ReachableMap[&DefBlock] = true; // self-loop on DefBlock + continue; + } + + ReachableMap.try_emplace(MBB, false); + ReachableOrdered.push_back(MBB); + + // If this block has a divergent terminator and the def block is its + // post-dominator, the wave may first visit the other successors. + bool Divergent = false; + for (MachineInstr &MI : MBB->terminators()) { + if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO || + MI.getOpcode() == AMDGPU::SI_IF || + MI.getOpcode() == AMDGPU::SI_ELSE || + MI.getOpcode() == AMDGPU::SI_LOOP) { + Divergent = true; + break; + } + } + + if (Divergent && PDT.dominates(&DefBlock, MBB)) { + for (MachineBasicBlock *Succ : MBB->successors()) + Stack.push_back(Succ); + } + } + + while (!Stack.empty()) { + MachineBasicBlock *MBB = Stack.pop_back_val(); + if (!ReachableMap.try_emplace(MBB, false).second) + continue; + ReachableOrdered.push_back(MBB); + + for (MachineBasicBlock *Succ : MBB->successors()) + Stack.push_back(Succ); + } + + for (MachineBasicBlock *MBB : ReachableOrdered) { + bool HaveReachablePred = false; + for (MachineBasicBlock *Pred : MBB->predecessors()) { + if (ReachableMap.count(Pred)) { + HaveReachablePred = true; + } else { + Stack.push_back(Pred); + } + } + if (!HaveReachablePred) + ReachableMap[MBB] = true; + if (HaveReachablePred) { + for (MachineBasicBlock *UnreachablePred : Stack) { + if 
(llvm::find(Predecessors, UnreachablePred) == Predecessors.end()) + Predecessors.push_back(UnreachablePred); + } + } + Stack.clear(); + } + } +}; + +/// Helper class that detects loops which require us to lower an i1 COPY into +/// bitwise manipulation. +/// +/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish +/// between loops with the same header. Consider this example: +/// +/// A-+-+ +/// | | | +/// B-+ | +/// | | +/// C---+ +/// +/// A is the header of a loop containing A, B, and C as far as LoopInfo is +/// concerned. However, an i1 COPY in B that is used in C must be lowered to +/// bitwise operations to combine results from different loop iterations when +/// B has a divergent branch (since by default we will compile this code such +/// that threads in a wave are merged at the entry of C). +/// +/// The following rule is implemented to determine whether bitwise operations +/// are required: use the bitwise lowering for a def in block B if a backward +/// edge to B is reachable without going through the nearest common +/// post-dominator of B and all uses of the def. +/// +/// TODO: This rule is conservative because it does not check whether the +/// relevant branches are actually divergent. +/// +/// The class is designed to cache the CFG traversal so that it can be re-used +/// for multiple defs within the same basic block. +/// +/// TODO: We could use region analysis to quickly skip over SESE regions during +/// the traversal. +/// +class LoopFinder { + MachineDominatorTree &DT; + MachinePostDominatorTree &PDT; + + // All visited / reachable block, tagged by level (level 0 is the def block, + // level 1 are all blocks reachable including but not going through the def + // block's IPDOM, etc.). + DenseMap<MachineBasicBlock *, unsigned> Visited; + + // Nearest common dominator of all visited blocks by level (level 0 is the + // def block). Used for seeding the SSAUpdater. 
+ SmallVector<MachineBasicBlock *, 4> CommonDominators; + + // Post-dominator of all visited blocks. + MachineBasicBlock *VisitedPostDom = nullptr; + + // Level at which a loop was found: 0 is not possible; 1 = a backward edge is + // reachable without going through the IPDOM of the def block (if the IPDOM + // itself has an edge to the def block, the loop level is 2), etc. + unsigned FoundLoopLevel = ~0u; + + MachineBasicBlock *DefBlock = nullptr; + SmallVector<MachineBasicBlock *, 4> Stack; + SmallVector<MachineBasicBlock *, 4> NextLevel; + +public: + LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT) + : DT(DT), PDT(PDT) {} + + void initialize(MachineBasicBlock &MBB) { + Visited.clear(); + CommonDominators.clear(); + Stack.clear(); + NextLevel.clear(); + VisitedPostDom = nullptr; + FoundLoopLevel = ~0u; + + DefBlock = &MBB; + } + + /// Check whether a backward edge can be reached without going through the + /// given \p PostDom of the def block. + /// + /// Return the level of \p PostDom if a loop was found, or 0 otherwise. + unsigned findLoop(MachineBasicBlock *PostDom) { + MachineDomTreeNode *PDNode = PDT.getNode(DefBlock); + + if (!VisitedPostDom) + advanceLevel(); + + unsigned Level = 0; + while (PDNode->getBlock() != PostDom) { + if (PDNode->getBlock() == VisitedPostDom) + advanceLevel(); + PDNode = PDNode->getIDom(); + Level++; + if (FoundLoopLevel == Level) + return Level; + } + + return 0; + } + + /// Add undef values dominating the loop and the optionally given additional + /// blocks, so that the SSA updater doesn't have to search all the way to the + /// function entry. 
+ void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater, + ArrayRef<MachineBasicBlock *> Blocks = {}) { + assert(LoopLevel < CommonDominators.size()); + + MachineBasicBlock *Dom = CommonDominators[LoopLevel]; + for (MachineBasicBlock *MBB : Blocks) + Dom = DT.findNearestCommonDominator(Dom, MBB); + + if (!inLoopLevel(*Dom, LoopLevel, Blocks)) { + SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom)); + } else { + // The dominator is part of the loop or the given blocks, so add the + // undef value to unreachable predecessors instead. + for (MachineBasicBlock *Pred : Dom->predecessors()) { + if (!inLoopLevel(*Pred, LoopLevel, Blocks)) + SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred)); + } + } + } + +private: + bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel, + ArrayRef<MachineBasicBlock *> Blocks) const { + auto DomIt = Visited.find(&MBB); + if (DomIt != Visited.end() && DomIt->second <= LoopLevel) + return true; + + if (llvm::find(Blocks, &MBB) != Blocks.end()) + return true; + + return false; + } + + void advanceLevel() { + MachineBasicBlock *VisitedDom; + + if (!VisitedPostDom) { + VisitedPostDom = DefBlock; + VisitedDom = DefBlock; + Stack.push_back(DefBlock); + } else { + VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock(); + VisitedDom = CommonDominators.back(); + + for (unsigned i = 0; i < NextLevel.size();) { + if (PDT.dominates(VisitedPostDom, NextLevel[i])) { + Stack.push_back(NextLevel[i]); + + NextLevel[i] = NextLevel.back(); + NextLevel.pop_back(); + } else { + i++; + } + } + } + + unsigned Level = CommonDominators.size(); + while (!Stack.empty()) { + MachineBasicBlock *MBB = Stack.pop_back_val(); + if (!PDT.dominates(VisitedPostDom, MBB)) + NextLevel.push_back(MBB); + + Visited[MBB] = Level; + VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB); + + for (MachineBasicBlock *Succ : MBB->successors()) { + if (Succ == DefBlock) { + if (MBB == VisitedPostDom) + FoundLoopLevel = 
std::min(FoundLoopLevel, Level + 1); + else + FoundLoopLevel = std::min(FoundLoopLevel, Level); + continue; + } + + if (Visited.try_emplace(Succ, ~0u).second) { + if (MBB == VisitedPostDom) + NextLevel.push_back(Succ); + else + Stack.push_back(Succ); + } + } + } + + CommonDominators.push_back(VisitedDom); + } }; } // End anonymous namespace. -INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE, - "SI Lower i1 Copies", false, false) +INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false, + false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false, + false) char SILowerI1Copies::ID = 0; @@ -64,104 +411,415 @@ FunctionPass *llvm::createSILowerI1CopiesPass() { return new SILowerI1Copies(); } -bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { +static unsigned createLaneMaskReg(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); + return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); +} + +static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) { + MachineFunction &MF = *MBB.getParent(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); - const TargetRegisterInfo *TRI = &TII->getRegisterInfo(); + unsigned UndefReg = createLaneMaskReg(MF); + BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF), + UndefReg); + return UndefReg; +} - std::vector<unsigned> I1Defs; +/// Lower all instructions that def or use vreg_1 registers. +/// +/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can +/// occur around inline assembly. We do this first, before vreg_1 registers +/// are changed to scalar mask registers. +/// +/// Then we lower all defs of vreg_1 registers. 
Phi nodes are lowered before +/// all others, because phi lowering looks through copies and can therefore +/// often make copy lowering unnecessary. +bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) { + MF = &TheMF; + MRI = &MF->getRegInfo(); + DT = &getAnalysis<MachineDominatorTree>(); + PDT = &getAnalysis<MachinePostDominatorTree>(); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { + ST = &MF->getSubtarget<GCNSubtarget>(); + TII = ST->getInstrInfo(); - MachineBasicBlock &MBB = *BI; - MachineBasicBlock::iterator I, Next; - for (I = MBB.begin(); I != MBB.end(); I = Next) { - Next = std::next(I); - MachineInstr &MI = *I; + lowerCopiesFromI1(); + lowerPhis(); + lowerCopiesToI1(); - if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) { - unsigned Reg = MI.getOperand(0).getReg(); - const TargetRegisterClass *RC = MRI.getRegClass(Reg); - if (RC == &AMDGPU::VReg_1RegClass) - MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass); - continue; - } + for (unsigned Reg : ConstrainRegs) + MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass); + ConstrainRegs.clear(); + return true; +} + +void SILowerI1Copies::lowerCopiesFromI1() { + SmallVector<MachineInstr *, 4> DeadCopies; + + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { if (MI.getOpcode() != AMDGPU::COPY) continue; - const MachineOperand &Dst = MI.getOperand(0); - const MachineOperand &Src = MI.getOperand(1); - - if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) || - !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SrcReg = MI.getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass) continue; - const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg()); - const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg()); + if (isLaneMaskReg(DstReg) || + (TargetRegisterInfo::isVirtualRegister(DstReg) && + 
MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass)) + continue; + // Copy into a 32-bit vector register. + LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI); DebugLoc DL = MI.getDebugLoc(); - MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg()); - if (DstRC == &AMDGPU::VReg_1RegClass && - TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) { - I1Defs.push_back(Dst.getReg()); - if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) { - if (DefInst->getOperand(1).isImm()) { - I1Defs.push_back(Dst.getReg()); + assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32); + assert(!MI.getOperand(0).getSubReg()); - int64_t Val = DefInst->getOperand(1).getImm(); - assert(Val == 0 || Val == -1); + ConstrainRegs.insert(SrcReg); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addImm(0) + .addImm(-1) + .addReg(SrcReg); + DeadCopies.push_back(&MI); + } - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32)) - .add(Dst) - .addImm(Val); - MI.eraseFromParent(); - continue; + for (MachineInstr *MI : DeadCopies) + MI->eraseFromParent(); + DeadCopies.clear(); + } +} + +void SILowerI1Copies::lowerPhis() { + MachineSSAUpdater SSAUpdater(*MF); + LoopFinder LF(*DT, *PDT); + PhiIncomingAnalysis PIA(*PDT); + SmallVector<MachineInstr *, 4> DeadPhis; + SmallVector<MachineBasicBlock *, 4> IncomingBlocks; + SmallVector<unsigned, 4> IncomingRegs; + SmallVector<unsigned, 4> IncomingUpdated; + + for (MachineBasicBlock &MBB : *MF) { + LF.initialize(MBB); + + for (MachineInstr &MI : MBB.phis()) { + unsigned DstReg = MI.getOperand(0).getReg(); + if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass) + continue; + + LLVM_DEBUG(dbgs() << "Lower PHI: " << MI); + + MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass); + + // Collect incoming values. 
+ for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { + assert(i + 1 < MI.getNumOperands()); + unsigned IncomingReg = MI.getOperand(i).getReg(); + MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB(); + MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg); + + if (IncomingDef->getOpcode() == AMDGPU::COPY) { + IncomingReg = IncomingDef->getOperand(1).getReg(); + assert(isLaneMaskReg(IncomingReg)); + assert(!IncomingDef->getOperand(1).getSubReg()); + } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) { + continue; + } else { + assert(IncomingDef->isPHI()); + } + + IncomingBlocks.push_back(IncomingMBB); + IncomingRegs.push_back(IncomingReg); + } + + // Phis in a loop that are observed outside the loop receive a simple but + // conservatively correct treatment. + MachineBasicBlock *PostDomBound = &MBB; + for (MachineInstr &Use : MRI->use_instructions(DstReg)) { + PostDomBound = + PDT->findNearestCommonDominator(PostDomBound, Use.getParent()); + } + + unsigned FoundLoopLevel = LF.findLoop(PostDomBound); + + SSAUpdater.Initialize(DstReg); + + if (FoundLoopLevel) { + LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks); + + for (unsigned i = 0; i < IncomingRegs.size(); ++i) { + IncomingUpdated.push_back(createLaneMaskReg(*MF)); + SSAUpdater.AddAvailableValue(IncomingBlocks[i], + IncomingUpdated.back()); + } + + for (unsigned i = 0; i < IncomingRegs.size(); ++i) { + MachineBasicBlock &IMBB = *IncomingBlocks[i]; + buildMergeLaneMasks( + IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i], + SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]); + } + } else { + // The phi is not observed from outside a loop. Use a more accurate + // lowering. 
+ PIA.analyze(MBB, IncomingBlocks); + + for (MachineBasicBlock *MBB : PIA.predecessors()) + SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB)); + + for (unsigned i = 0; i < IncomingRegs.size(); ++i) { + MachineBasicBlock &IMBB = *IncomingBlocks[i]; + if (PIA.isSource(IMBB)) { + IncomingUpdated.push_back(0); + SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]); + } else { + IncomingUpdated.push_back(createLaneMaskReg(*MF)); + SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back()); } } - unsigned int TmpSrc = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::COPY), TmpSrc) - .add(Src); - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64)) - .add(Dst) - .addImm(0) - .addImm(-1) - .addReg(TmpSrc); - MI.eraseFromParent(); - } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && - SrcRC == &AMDGPU::VReg_1RegClass) { - if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 && - DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() && - DefInst->getOperand(1).getImm() == 0 && - DefInst->getOperand(2).getImm() != 0 && - DefInst->getOperand(3).isReg() && - TargetRegisterInfo::isVirtualRegister( - DefInst->getOperand(3).getReg()) && - TRI->getCommonSubClass( - MRI.getRegClass(DefInst->getOperand(3).getReg()), - &AMDGPU::SGPR_64RegClass) && - AMDGPU::laneDominates(DefInst->getParent(), &MBB)) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64)) - .add(Dst) - .addReg(AMDGPU::EXEC) - .add(DefInst->getOperand(3)); - } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64)) - .add(Dst) - .add(Src) - .addImm(0); + for (unsigned i = 0; i < IncomingRegs.size(); ++i) { + if (!IncomingUpdated[i]) + continue; + + MachineBasicBlock &IMBB = *IncomingBlocks[i]; + buildMergeLaneMasks( + IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i], + SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]); } - MI.eraseFromParent(); } + + unsigned NewReg = 
SSAUpdater.GetValueInMiddleOfBlock(&MBB); + if (NewReg != DstReg) { + MRI->replaceRegWith(NewReg, DstReg); + + // Ensure that DstReg has a single def and mark the old PHI node for + // deletion. + MI.getOperand(0).setReg(NewReg); + DeadPhis.push_back(&MI); + } + + IncomingBlocks.clear(); + IncomingRegs.clear(); + IncomingUpdated.clear(); } + + for (MachineInstr *MI : DeadPhis) + MI->eraseFromParent(); + DeadPhis.clear(); } +} + +void SILowerI1Copies::lowerCopiesToI1() { + MachineSSAUpdater SSAUpdater(*MF); + LoopFinder LF(*DT, *PDT); + SmallVector<MachineInstr *, 4> DeadCopies; + + for (MachineBasicBlock &MBB : *MF) { + LF.initialize(MBB); - for (unsigned Reg : I1Defs) - MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass); + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF && + MI.getOpcode() != AMDGPU::COPY) + continue; + + unsigned DstReg = MI.getOperand(0).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(DstReg) || + MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass) + continue; + + if (MRI->use_empty(DstReg)) { + DeadCopies.push_back(&MI); + continue; + } + + LLVM_DEBUG(dbgs() << "Lower Other: " << MI); + + MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass); + if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) + continue; + + DebugLoc DL = MI.getDebugLoc(); + unsigned SrcReg = MI.getOperand(1).getReg(); + assert(!MI.getOperand(1).getSubReg()); + + if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || + !isLaneMaskReg(SrcReg)) { + assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32); + unsigned TmpReg = createLaneMaskReg(*MF); + BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg) + .addReg(SrcReg) + .addImm(0); + MI.getOperand(1).setReg(TmpReg); + SrcReg = TmpReg; + } + + // Defs in a loop that are observed outside the loop must be transformed + // into appropriate bit manipulation. 
+ MachineBasicBlock *PostDomBound = &MBB; + for (MachineInstr &Use : MRI->use_instructions(DstReg)) { + PostDomBound = + PDT->findNearestCommonDominator(PostDomBound, Use.getParent()); + } + + unsigned FoundLoopLevel = LF.findLoop(PostDomBound); + if (FoundLoopLevel) { + SSAUpdater.Initialize(DstReg); + SSAUpdater.AddAvailableValue(&MBB, DstReg); + LF.addLoopEntries(FoundLoopLevel, SSAUpdater); + + buildMergeLaneMasks(MBB, MI, DL, DstReg, + SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg); + DeadCopies.push_back(&MI); + } + } + + for (MachineInstr *MI : DeadCopies) + MI->eraseFromParent(); + DeadCopies.clear(); + } +} + +bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const { + const MachineInstr *MI; + for (;;) { + MI = MRI->getUniqueVRegDef(Reg); + if (MI->getOpcode() != AMDGPU::COPY) + break; + + Reg = MI->getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return false; + if (!isLaneMaskReg(Reg)) + return false; + } + + if (MI->getOpcode() != AMDGPU::S_MOV_B64) + return false; + + if (!MI->getOperand(1).isImm()) + return false; + + int64_t Imm = MI->getOperand(1).getImm(); + if (Imm == 0) { + Val = false; + return true; + } + if (Imm == -1) { + Val = true; + return true; + } return false; } + +static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) { + Def = false; + Use = false; + + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.getReg() == AMDGPU::SCC) { + if (MO.isUse()) + Use = true; + else + Def = true; + } + } +} + +/// Return a point at the end of the given \p MBB to insert SALU instructions +/// for lane mask calculation. Take terminators and SCC into account. 
+MachineBasicBlock::iterator +SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const { + auto InsertionPt = MBB.getFirstTerminator(); + bool TerminatorsUseSCC = false; + for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) { + bool DefsSCC; + instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC); + if (TerminatorsUseSCC || DefsSCC) + break; + } + + if (!TerminatorsUseSCC) + return InsertionPt; + + while (InsertionPt != MBB.begin()) { + InsertionPt--; + + bool DefSCC, UseSCC; + instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC); + if (DefSCC) + return InsertionPt; + } + + // We should have at least seen an IMPLICIT_DEF or COPY + llvm_unreachable("SCC used by terminator but no def in block"); +} + +void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DstReg, + unsigned PrevReg, unsigned CurReg) { + bool PrevVal; + bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal); + bool CurVal; + bool CurConstant = isConstantLaneMask(CurReg, CurVal); + + if (PrevConstant && CurConstant) { + if (PrevVal == CurVal) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg); + } else if (CurVal) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(AMDGPU::EXEC); + } else { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg) + .addReg(AMDGPU::EXEC) + .addImm(-1); + } + return; + } + + unsigned PrevMaskedReg = 0; + unsigned CurMaskedReg = 0; + if (!PrevConstant) { + if (CurConstant && CurVal) { + PrevMaskedReg = PrevReg; + } else { + PrevMaskedReg = createLaneMaskReg(*MF); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg) + .addReg(PrevReg) + .addReg(AMDGPU::EXEC); + } + } + if (!CurConstant) { + // TODO: check whether CurReg is already masked by EXEC + if (PrevConstant && PrevVal) { + CurMaskedReg = CurReg; + } else { + CurMaskedReg = createLaneMaskReg(*MF); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg) + .addReg(CurReg) + 
.addReg(AMDGPU::EXEC); + } + } + + if (PrevConstant && !PrevVal) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg) + .addReg(CurMaskedReg); + } else if (CurConstant && !CurVal) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg) + .addReg(PrevMaskedReg); + } else if (PrevConstant && PrevVal) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg) + .addReg(CurMaskedReg) + .addReg(AMDGPU::EXEC); + } else { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg) + .addReg(PrevMaskedReg) + .addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC); + } +} diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 0d5ff75e37ed8..181cc41bd5ff7 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -117,7 +117,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) } const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - bool MaySpill = ST.isVGPRSpillingEnabled(F); bool HasStackObjects = FrameInfo.hasStackObjects(); if (isEntryFunction()) { @@ -126,21 +125,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (WorkItemIDZ) WorkItemIDY = true; - if (HasStackObjects || MaySpill) { - PrivateSegmentWaveByteOffset = true; + PrivateSegmentWaveByteOffset = true; // HS and GS always have the scratch wave offset in SGPR5 on GFX9. 
if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) - ArgInfo.PrivateSegmentWaveByteOffset - = ArgDescriptor::createRegister(AMDGPU::SGPR5); - } + ArgInfo.PrivateSegmentWaveByteOffset = + ArgDescriptor::createRegister(AMDGPU::SGPR5); } - bool IsCOV2 = ST.isAmdCodeObjectV2(F); - if (IsCOV2) { - if (HasStackObjects || MaySpill) - PrivateSegmentBuffer = true; + bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); + if (isAmdHsaOrMesa) { + PrivateSegmentBuffer = true; if (F.hasFnAttribute("amdgpu-dispatch-ptr")) DispatchPtr = true; @@ -151,14 +147,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (F.hasFnAttribute("amdgpu-dispatch-id")) DispatchID = true; } else if (ST.isMesaGfxShader(F)) { - if (HasStackObjects || MaySpill) - ImplicitBufferPtr = true; + ImplicitBufferPtr = true; } if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr")) KernargSegmentPtr = true; - if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) { + if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) { // TODO: This could be refined a lot. The attribute is a poor way of // detecting calls that may require it before argument lowering. if (HasStackObjects || F.hasFnAttribute("amdgpu-flat-scratch")) diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp index 18754442898f7..fb7e670068fe6 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -471,7 +471,7 @@ void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) { #ifndef NDEBUG if (SuccSU->NumPredsLeft == 0) { dbgs() << "*** Scheduling failed! 
***\n"; - SuccSU->dump(DAG); + DAG->dumpNode(*SuccSU); dbgs() << " has been released too many times!\n"; llvm_unreachable(nullptr); } @@ -611,13 +611,11 @@ void SIScheduleBlock::printDebug(bool full) { dbgs() << "\nInstructions:\n"; if (!Scheduled) { - for (SUnit* SU : SUnits) { - SU->dump(DAG); - } + for (const SUnit* SU : SUnits) + DAG->dumpNode(*SU); } else { - for (SUnit* SU : SUnits) { - SU->dump(DAG); - } + for (const SUnit* SU : SUnits) + DAG->dumpNode(*SU); } dbgs() << "///////////////////////\n"; @@ -1933,7 +1931,7 @@ void SIScheduleDAGMI::schedule() LLVM_DEBUG(dbgs() << "Preparing Scheduling\n"); buildDAGWithRegPressure(); - LLVM_DEBUG(for (SUnit &SU : SUnits) SU.dumpAll(this)); + LLVM_DEBUG(dump()); topologicalSort(); findRootsAndBiasEdges(TopRoots, BotRoots); @@ -1957,12 +1955,12 @@ void SIScheduleDAGMI::schedule() for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { SUnit *SU = &SUnits[i]; - unsigned BaseLatReg; + MachineOperand *BaseLatOp; int64_t OffLatReg; if (SITII->isLowLatencyInstruction(*SU->getInstr())) { IsLowLatencySU[i] = 1; - if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg, - TRI)) + if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg, + TRI)) LowLatencyOffset[i] = OffLatReg; } else if (SITII->isHighLatencyInstruction(*SU->getInstr())) IsHighLatencySU[i] = 1; diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 938cdaf1ef8fb..b4a4e9e33133d 100644 --- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -202,8 +202,6 @@ public: class SIMemOpAccess final { private: - - AMDGPUAS SIAddrSpaceInfo; AMDGPUMachineModuleInfo *MMI = nullptr; /// Reports unsupported message \p Msg for \p MI to LLVM context. @@ -255,7 +253,7 @@ protected: /// Instruction info. 
const SIInstrInfo *TII = nullptr; - IsaInfo::IsaVersion IV; + IsaVersion IV; SICacheControl(const GCNSubtarget &ST); @@ -453,22 +451,21 @@ SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID, } SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const { - if (AS == SIAddrSpaceInfo.FLAT_ADDRESS) + if (AS == AMDGPUAS::FLAT_ADDRESS) return SIAtomicAddrSpace::FLAT; - if (AS == SIAddrSpaceInfo.GLOBAL_ADDRESS) + if (AS == AMDGPUAS::GLOBAL_ADDRESS) return SIAtomicAddrSpace::GLOBAL; - if (AS == SIAddrSpaceInfo.LOCAL_ADDRESS) + if (AS == AMDGPUAS::LOCAL_ADDRESS) return SIAtomicAddrSpace::LDS; - if (AS == SIAddrSpaceInfo.PRIVATE_ADDRESS) + if (AS == AMDGPUAS::PRIVATE_ADDRESS) return SIAtomicAddrSpace::SCRATCH; - if (AS == SIAddrSpaceInfo.REGION_ADDRESS) + if (AS == AMDGPUAS::REGION_ADDRESS) return SIAtomicAddrSpace::GDS; return SIAtomicAddrSpace::OTHER; } SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) { - SIAddrSpaceInfo = getAMDGPUAS(MF.getTarget()); MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); } @@ -608,7 +605,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo( SICacheControl::SICacheControl(const GCNSubtarget &ST) { TII = ST.getInstrInfo(); - IV = IsaInfo::getIsaVersion(ST.getFeatureBits()); + IV = getIsaVersion(ST.getCPU()); } /* static */ @@ -815,6 +812,12 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); + const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>(); + + const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS() + ? 
AMDGPU::BUFFER_WBINVL1 + : AMDGPU::BUFFER_WBINVL1_VOL; + if (Pos == Position::AFTER) ++MI; @@ -822,7 +825,7 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI, switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: - BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1_VOL)); + BuildMI(MBB, MI, DL, TII->get(Flush)); Changed = true; break; case SIAtomicScope::WORKGROUP: diff --git a/lib/Target/AMDGPU/SIModeRegister.cpp b/lib/Target/AMDGPU/SIModeRegister.cpp new file mode 100644 index 0000000000000..883fd308f2f4b --- /dev/null +++ b/lib/Target/AMDGPU/SIModeRegister.cpp @@ -0,0 +1,406 @@ +//===-- SIModeRegister.cpp - Mode Register --------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This pass inserts changes to the Mode register settings as required. +/// Note that currently it only deals with the Double Precision Floating Point +/// rounding mode setting, but is intended to be generic enough to be easily +/// expanded. 
+/// +//===----------------------------------------------------------------------===// +// +#include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include <queue> + +#define DEBUG_TYPE "si-mode-register" + +STATISTIC(NumSetregInserted, "Number of setreg of mode register inserted."); + +using namespace llvm; + +struct Status { + // Mask is a bitmask where a '1' indicates the corresponding Mode bit has a + // known value + unsigned Mask; + unsigned Mode; + + Status() : Mask(0), Mode(0){}; + + Status(unsigned Mask, unsigned Mode) : Mask(Mask), Mode(Mode) { + Mode &= Mask; + }; + + // merge two status values such that only values that don't conflict are + // preserved + Status merge(const Status &S) const { + return Status((Mask | S.Mask), ((Mode & ~S.Mask) | (S.Mode & S.Mask))); + } + + // merge an unknown value by using the unknown value's mask to remove bits + // from the result + Status mergeUnknown(unsigned newMask) { + return Status(Mask & ~newMask, Mode & ~newMask); + } + + // intersect two Status values to produce a mode and mask that is a subset + // of both values + Status intersect(const Status &S) const { + unsigned NewMask = (Mask & S.Mask) & (Mode ^ ~S.Mode); + unsigned NewMode = (Mode & NewMask); + return Status(NewMask, NewMode); + } + + // produce the delta required to change the Mode to the required Mode + Status delta(const Status &S) const { + return Status((S.Mask & (Mode ^ S.Mode)) | (~Mask & S.Mask), S.Mode); + } + + bool operator==(const Status &S) const { + return (Mask == S.Mask) 
&& (Mode == S.Mode); + } + + bool operator!=(const Status &S) const { return !(*this == S); } + + bool isCompatible(Status &S) { + return ((Mask & S.Mask) == S.Mask) && ((Mode & S.Mask) == S.Mode); + } + + bool isCombinable(Status &S) { + return !(Mask & S.Mask) || isCompatible(S); + } +}; + +class BlockData { +public: + // The Status that represents the mode register settings required by the + // FirstInsertionPoint (if any) in this block. Calculated in Phase 1. + Status Require; + + // The Status that represents the net changes to the Mode register made by + // this block, Calculated in Phase 1. + Status Change; + + // The Status that represents the mode register settings on exit from this + // block. Calculated in Phase 2. + Status Exit; + + // The Status that represents the intersection of exit Mode register settings + // from all predecessor blocks. Calculated in Phase 2, and used by Phase 3. + Status Pred; + + // In Phase 1 we record the first instruction that has a mode requirement, + // which is used in Phase 3 if we need to insert a mode change. + MachineInstr *FirstInsertionPoint; + + BlockData() : FirstInsertionPoint(nullptr) {}; +}; + +namespace { + +class SIModeRegister : public MachineFunctionPass { +public: + static char ID; + + std::vector<std::unique_ptr<BlockData>> BlockInfo; + std::queue<MachineBasicBlock *> Phase2List; + + // The default mode register setting currently only caters for the floating + // point double precision rounding mode. + // We currently assume the default rounding mode is Round to Nearest + // NOTE: this should come from a per function rounding mode setting once such + // a setting exists. 
+ unsigned DefaultMode = FP_ROUND_ROUND_TO_NEAREST; + Status DefaultStatus = + Status(FP_ROUND_MODE_DP(0x3), FP_ROUND_MODE_DP(DefaultMode)); + +public: + SIModeRegister() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + void processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII); + + void processBlockPhase2(MachineBasicBlock &MBB, const SIInstrInfo *TII); + + void processBlockPhase3(MachineBasicBlock &MBB, const SIInstrInfo *TII); + + Status getInstructionMode(MachineInstr &MI, const SIInstrInfo *TII); + + void insertSetreg(MachineBasicBlock &MBB, MachineInstr *I, + const SIInstrInfo *TII, Status InstrMode); +}; +} // End anonymous namespace. + +INITIALIZE_PASS(SIModeRegister, DEBUG_TYPE, + "Insert required mode register values", false, false) + +char SIModeRegister::ID = 0; + +char &llvm::SIModeRegisterID = SIModeRegister::ID; + +FunctionPass *llvm::createSIModeRegisterPass() { return new SIModeRegister(); } + +// Determine the Mode register setting required for this instruction. +// Instructions which don't use the Mode register return a null Status. +// Note this currently only deals with instructions that use the floating point +// double precision setting. +Status SIModeRegister::getInstructionMode(MachineInstr &MI, + const SIInstrInfo *TII) { + if (TII->usesFPDPRounding(MI)) { + switch (MI.getOpcode()) { + case AMDGPU::V_INTERP_P1LL_F16: + case AMDGPU::V_INTERP_P1LV_F16: + case AMDGPU::V_INTERP_P2_F16: + // f16 interpolation instructions need double precision round to zero + return Status(FP_ROUND_MODE_DP(3), + FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO)); + default: + return DefaultStatus; + } + } + return Status(); +} + +// Insert a setreg instruction to update the Mode register. 
+// It is possible (though unlikely) for an instruction to require a change to +// the value of disjoint parts of the Mode register when we don't know the +// value of the intervening bits. In that case we need to use more than one +// setreg instruction. +void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI, + const SIInstrInfo *TII, Status InstrMode) { + while (InstrMode.Mask) { + unsigned Offset = countTrailingZeros<unsigned>(InstrMode.Mask); + unsigned Width = countTrailingOnes<unsigned>(InstrMode.Mask >> Offset); + unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1); + BuildMI(MBB, MI, 0, TII->get(AMDGPU::S_SETREG_IMM32_B32)) + .addImm(Value) + .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) | + (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) | + (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_)); + ++NumSetregInserted; + InstrMode.Mask &= ~(((1 << Width) - 1) << Offset); + } +} + +// In Phase 1 we iterate through the instructions of the block and for each +// instruction we get its mode usage. If the instruction uses the Mode register +// we: +// - update the Change status, which tracks the changes to the Mode register +// made by this block +// - if this instruction's requirements are compatible with the current setting +// of the Mode register we merge the modes +// - if it isn't compatible and an InsertionPoint isn't set, then we set the +// InsertionPoint to the current instruction, and we remember the current +// mode +// - if it isn't compatible and InsertionPoint is set we insert a setreg before +// that instruction (unless this instruction forms part of the block's +// entry requirements in which case the insertion is deferred until Phase 3 +// when predecessor exit values are known), and move the insertion point to +// this instruction +// - if this is a setreg instruction we treat it as an incompatible instruction.
+// This is sub-optimal but avoids some nasty corner cases, and is expected to +// occur very rarely. +// - on exit we have set the Require, Change, and initial Exit modes. +void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB, + const SIInstrInfo *TII) { + auto NewInfo = llvm::make_unique<BlockData>(); + MachineInstr *InsertionPoint = nullptr; + // RequirePending is used to indicate whether we are collecting the initial + // requirements for the block, and need to defer the first InsertionPoint to + // Phase 3. It is set to false once we have set FirstInsertionPoint, or when + // we discover an explicit setreg that means this block doesn't have any + // initial requirements. + bool RequirePending = true; + Status IPChange; + for (MachineInstr &MI : MBB) { + Status InstrMode = getInstructionMode(MI, TII); + if ((MI.getOpcode() == AMDGPU::S_SETREG_B32) || + (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32)) { + // We preserve any explicit mode register setreg instruction we encounter, + // as we assume it has been inserted by a higher authority (this is + // likely to be a very rare occurrence). + unsigned Dst = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm(); + if (((Dst & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) != + AMDGPU::Hwreg::ID_MODE) + continue; + + unsigned Width = ((Dst & AMDGPU::Hwreg::WIDTH_M1_MASK_) >> + AMDGPU::Hwreg::WIDTH_M1_SHIFT_) + + 1; + unsigned Offset = + (Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_; + unsigned Mask = ((1 << Width) - 1) << Offset; + + // If an InsertionPoint is set we will insert a setreg there. + if (InsertionPoint) { + insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change)); + InsertionPoint = nullptr; + } + // If this is an immediate then we know the value being set, but if it is + // not an immediate then we treat the modified bits of the mode register + // as unknown.
+ if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32) { + unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::imm)->getImm(); + unsigned Mode = (Val << Offset) & Mask; + Status Setreg = Status(Mask, Mode); + // If we haven't already set the initial requirements for the block we + // don't need to as the requirements start from this explicit setreg. + RequirePending = false; + NewInfo->Change = NewInfo->Change.merge(Setreg); + } else { + NewInfo->Change = NewInfo->Change.mergeUnknown(Mask); + } + } else if (!NewInfo->Change.isCompatible(InstrMode)) { + // This instruction uses the Mode register and its requirements aren't + // compatible with the current mode. + if (InsertionPoint) { + // If the required mode change cannot be included in the current + // InsertionPoint changes, we need a setreg and start a new + // InsertionPoint. + if (!IPChange.delta(NewInfo->Change).isCombinable(InstrMode)) { + if (RequirePending) { + // This is the first insertionPoint in the block so we will defer + // the insertion of the setreg to Phase 3 where we know whether or + // not it is actually needed. + NewInfo->FirstInsertionPoint = InsertionPoint; + NewInfo->Require = NewInfo->Change; + RequirePending = false; + } else { + insertSetreg(MBB, InsertionPoint, TII, + IPChange.delta(NewInfo->Change)); + IPChange = NewInfo->Change; + } + // Set the new InsertionPoint + InsertionPoint = &MI; + } + NewInfo->Change = NewInfo->Change.merge(InstrMode); + } else { + // No InsertionPoint is currently set - this is either the first in + // the block or we have previously seen an explicit setreg. + InsertionPoint = &MI; + IPChange = NewInfo->Change; + NewInfo->Change = NewInfo->Change.merge(InstrMode); + } + } + } + if (RequirePending) { + // If we haven't yet set the initial requirements for the block we set them + // now. 
+ NewInfo->FirstInsertionPoint = InsertionPoint; + NewInfo->Require = NewInfo->Change; + } else if (InsertionPoint) { + // We need to insert a setreg at the InsertionPoint + insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change)); + } + NewInfo->Exit = NewInfo->Change; + BlockInfo[MBB.getNumber()] = std::move(NewInfo); +} + +// In Phase 2 we revisit each block and calculate the common Mode register +// value provided by all predecessor blocks. If the Exit value for the block +// is changed, then we add the successor blocks to the worklist so that the +// exit value is propagated. +void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB, + const SIInstrInfo *TII) { +// BlockData *BI = BlockInfo[MBB.getNumber()]; + unsigned ThisBlock = MBB.getNumber(); + if (MBB.pred_empty()) { + // There are no predecessors, so use the default starting status. + BlockInfo[ThisBlock]->Pred = DefaultStatus; + } else { + // Build a status that is common to all the predecessors by intersecting + // all the predecessor exit status values. + MachineBasicBlock::pred_iterator P = MBB.pred_begin(), E = MBB.pred_end(); + MachineBasicBlock &PB = *(*P); + BlockInfo[ThisBlock]->Pred = BlockInfo[PB.getNumber()]->Exit; + + for (P = std::next(P); P != E; P = std::next(P)) { + MachineBasicBlock *Pred = *P; + BlockInfo[ThisBlock]->Pred = BlockInfo[ThisBlock]->Pred.intersect(BlockInfo[Pred->getNumber()]->Exit); + } + } + Status TmpStatus = BlockInfo[ThisBlock]->Pred.merge(BlockInfo[ThisBlock]->Change); + if (BlockInfo[ThisBlock]->Exit != TmpStatus) { + BlockInfo[ThisBlock]->Exit = TmpStatus; + // Add the successors to the work list so we can propagate the changed exit + // status. 
+ for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(), + E = MBB.succ_end(); + S != E; S = std::next(S)) { + MachineBasicBlock &B = *(*S); + Phase2List.push(&B); + } + } +} + +// In Phase 3 we revisit each block and if it has an insertion point defined we +// check whether the predecessor mode meets the block's entry requirements. If +// not we insert an appropriate setreg instruction to modify the Mode register. +void SIModeRegister::processBlockPhase3(MachineBasicBlock &MBB, + const SIInstrInfo *TII) { +// BlockData *BI = BlockInfo[MBB.getNumber()]; + unsigned ThisBlock = MBB.getNumber(); + if (!BlockInfo[ThisBlock]->Pred.isCompatible(BlockInfo[ThisBlock]->Require)) { + Status Delta = BlockInfo[ThisBlock]->Pred.delta(BlockInfo[ThisBlock]->Require); + if (BlockInfo[ThisBlock]->FirstInsertionPoint) + insertSetreg(MBB, BlockInfo[ThisBlock]->FirstInsertionPoint, TII, Delta); + else + insertSetreg(MBB, &MBB.instr_front(), TII, Delta); + } +} + +bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) { + BlockInfo.resize(MF.getNumBlockIDs()); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + // Processing is performed in a number of phases + + // Phase 1 - determine the initial mode required by each block, and add setreg + // instructions for intra block requirements. + for (MachineBasicBlock &BB : MF) + processBlockPhase1(BB, TII); + + // Phase 2 - determine the exit mode from each block. We add all blocks to the + // list here, but will also add any that need to be revisited during Phase 2 + // processing. + for (MachineBasicBlock &BB : MF) + Phase2List.push(&BB); + while (!Phase2List.empty()) { + processBlockPhase2(*Phase2List.front(), TII); + Phase2List.pop(); + } + + // Phase 3 - add an initial setreg to each block where the required entry mode + // is not satisfied by the exit mode of all its predecessors. 
+ for (MachineBasicBlock &BB : MF) + processBlockPhase3(BB, TII); + + BlockInfo.clear(); + + return NumSetregInserted > 0; +} diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp index 7b678d12ba818..c671fed34bdf1 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -103,6 +103,122 @@ static MachineInstr* getOrExecSource(const MachineInstr &MI, return SaveExecInst; } +// Optimize sequence +// %sel = V_CNDMASK_B32_e64 0, 1, %cc +// %cmp = V_CMP_NE_U32 1, %1 +// $vcc = S_AND_B64 $exec, %cmp +// S_CBRANCH_VCC[N]Z +// => +// $vcc = S_ANDN2_B64 $exec, %cc +// S_CBRANCH_VCC[N]Z +// +// It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the +// rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but +// only 3 first instructions are really needed. S_AND_B64 with exec is a +// required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive +// lanes. +// +// Returns %cc register on success. 
+static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB, + const GCNSubtarget &ST, + MachineRegisterInfo &MRI, + LiveIntervals *LIS) { + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const unsigned AndOpc = AMDGPU::S_AND_B64; + const unsigned Andn2Opc = AMDGPU::S_ANDN2_B64; + const unsigned CondReg = AMDGPU::VCC; + const unsigned ExecReg = AMDGPU::EXEC; + + auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + return Opc == AMDGPU::S_CBRANCH_VCCZ || + Opc == AMDGPU::S_CBRANCH_VCCNZ; }); + if (I == MBB.terminators().end()) + return AMDGPU::NoRegister; + + auto *And = TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister, + *I, MRI, LIS); + if (!And || And->getOpcode() != AndOpc || + !And->getOperand(1).isReg() || !And->getOperand(2).isReg()) + return AMDGPU::NoRegister; + + MachineOperand *AndCC = &And->getOperand(1); + unsigned CmpReg = AndCC->getReg(); + unsigned CmpSubReg = AndCC->getSubReg(); + if (CmpReg == ExecReg) { + AndCC = &And->getOperand(2); + CmpReg = AndCC->getReg(); + CmpSubReg = AndCC->getSubReg(); + } else if (And->getOperand(2).getReg() != ExecReg) { + return AMDGPU::NoRegister; + } + + auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, MRI, LIS); + if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 || + Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) || + Cmp->getParent() != And->getParent()) + return AMDGPU::NoRegister; + + MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0); + MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1); + if (Op1->isImm() && Op2->isReg()) + std::swap(Op1, Op2); + if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1) + return AMDGPU::NoRegister; + + unsigned SelReg = Op1->getReg(); + auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, MRI, LIS); + if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64) + return AMDGPU::NoRegister; + + Op1 = 
TII->getNamedOperand(*Sel, AMDGPU::OpName::src0); + Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1); + MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2); + if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() || + Op1->getImm() != 0 || Op2->getImm() != 1) + return AMDGPU::NoRegister; + + LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' + << *Cmp << '\t' << *And); + + unsigned CCReg = CC->getReg(); + LIS->RemoveMachineInstrFromMaps(*And); + MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(), + TII->get(Andn2Opc), And->getOperand(0).getReg()) + .addReg(ExecReg) + .addReg(CCReg, CC->getSubReg()); + And->eraseFromParent(); + LIS->InsertMachineInstrInMaps(*Andn2); + + LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n'); + + // Try to remove compare. Cmp value should not be used in between of cmp + // and s_and_b64 if VCC or just unused if any other register. + if ((TargetRegisterInfo::isVirtualRegister(CmpReg) && + MRI.use_nodbg_empty(CmpReg)) || + (CmpReg == CondReg && + std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(), + [&](const MachineInstr &MI) { + return MI.readsRegister(CondReg, TRI); }))) { + LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n'); + + LIS->RemoveMachineInstrFromMaps(*Cmp); + Cmp->eraseFromParent(); + + // Try to remove v_cndmask_b32.
+ if (TargetRegisterInfo::isVirtualRegister(SelReg) && + MRI.use_nodbg_empty(SelReg)) { + LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n'); + + LIS->RemoveMachineInstrFromMaps(*Sel); + Sel->eraseFromParent(); + } + } + + return CCReg; +} + bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -117,9 +233,24 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) { + if (unsigned Reg = optimizeVcndVcmpPair(MBB, ST, MRI, LIS)) { + RecalcRegs.insert(Reg); + RecalcRegs.insert(AMDGPU::VCC_LO); + RecalcRegs.insert(AMDGPU::VCC_HI); + RecalcRegs.insert(AMDGPU::SCC); + Changed = true; + } + // Try to remove unneeded instructions before s_endpgm. if (MBB.succ_empty()) { - if (MBB.empty() || MBB.back().getOpcode() != AMDGPU::S_ENDPGM) + if (MBB.empty()) + continue; + + // Skip this if the endpgm has any implicit uses, otherwise we would need + // to be careful to update / remove them. 
+ MachineInstr &Term = MBB.back(); + if (Term.getOpcode() != AMDGPU::S_ENDPGM || + Term.getNumOperands() != 0) continue; SmallVector<MachineBasicBlock*, 4> Blocks({&MBB}); diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 0e000b72962eb..2d43d5d05ef64 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -90,7 +90,9 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineBasicBlock &MBB); std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI); - bool isConvertibleToSDWA(const MachineInstr &MI, const GCNSubtarget &ST) const; + bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const; + void pseudoOpConvertToVOP2(MachineInstr &MI, + const GCNSubtarget &ST) const; bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; @@ -854,7 +856,82 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { } } -bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, +// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and +// V_ADD_I32_e64 into V_ADD_I32_e32. This allows isConvertibleToSDWA +// to perform its transformation on V_ADD_I32_e32 into V_ADD_I32_sdwa. +// +// We are transforming from a VOP3 into a VOP2 form of the instruction. 
+// %19:vgpr_32 = V_AND_B32_e32 255, +// killed %16:vgpr_32, implicit $exec +// %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64 +// %26.sub0:vreg_64, %19:vgpr_32, implicit $exec +// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64 +// %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec +// +// becomes +// %47:vgpr_32 = V_ADD_I32_sdwa +// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0, +// implicit-def $vcc, implicit $exec +// %48:vgpr_32 = V_ADDC_U32_e32 +// 0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec +void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI, + const GCNSubtarget &ST) const { + int Opc = MI.getOpcode(); + assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) && + "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64"); + + // Can the candidate MI be shrunk? + if (!TII->canShrink(MI, *MRI)) + return; + Opc = AMDGPU::getVOPe32(Opc); + // Find the related ADD instruction. + const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + if (!Sdst) + return; + MachineOperand *NextOp = findSingleRegUse(Sdst, MRI); + if (!NextOp) + return; + MachineInstr &MISucc = *NextOp->getParent(); + // Can the successor be shrunk? + if (!TII->canShrink(MISucc, *MRI)) + return; + int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode()); + // Make sure the carry in/out are subsequently unused. + MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2); + if (!CarryIn) + return; + MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst); + if (!CarryOut) + return; + if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg())) + return; + // Make sure VCC or its subregs are dead before MI. + MachineBasicBlock &MBB = *MI.getParent(); + auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25); + if (Liveness != MachineBasicBlock::LQR_Dead) + return; + // Check if VCC is referenced in range of (MI,MISucc]. 
+ for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator(); + I != E; ++I) { + if (I->modifiesRegister(AMDGPU::VCC, TRI)) + return; + } + // Make the two new e32 instruction variants. + // Replace MI with V_{SUB|ADD}_I32_e32 + auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc)); + NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst)); + NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); + NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1)); + MI.eraseFromParent(); + // Replace MISucc with V_{SUBB|ADDC}_U32_e32 + auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc)); + NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst)); + NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0)); + NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1)); + MISucc.eraseFromParent(); +} + +bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const { // Check if this is already an SDWA instruction unsigned Opc = MI.getOpcode(); @@ -1127,6 +1204,22 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) { bool Changed = false; do { + // Preprocess the ADD/SUB pairs so they could be SDWA'ed. + // Look for a possible ADD or SUB that resulted from a previously lowered + // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2 + // lowers the pair of instructions into e32 form. + matchSDWAOperands(MBB); + for (const auto &OperandPair : SDWAOperands) { + const auto &Operand = OperandPair.second; + MachineInstr *PotentialMI = Operand->potentialToConvert(TII); + if (PotentialMI && + (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 || + PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64)) + pseudoOpConvertToVOP2(*PotentialMI, ST); + } + SDWAOperands.clear(); + + // Generate potential match list. 
matchSDWAOperands(MBB); for (const auto &OperandPair : SDWAOperands) { diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 624607f6ea542..97cfde2b23541 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -18,9 +18,12 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" @@ -495,15 +498,16 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, return false; const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); - MachineInstrBuilder NewMI = BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .add(*Reg) - .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) - .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MachineInstrBuilder NewMI = + BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) + .add(*Reg) + .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .cloneMemRefs(*MI); const MachineOperand *VDataIn = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata_in); @@ -900,7 +904,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, .addImm(0) // glc .addMemOperand(MMO); - if (NumSubRegs > 1) + if (NumSubRegs > 1 && i == 0) MIB.addReg(SuperReg, RegState::ImplicitDefine); continue; @@ -914,7 +918,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, .addReg(Spill.VGPR) 
.addImm(Spill.Lane); - if (NumSubRegs > 1) + if (NumSubRegs > 1 && i == 0) MIB.addReg(SuperReg, RegState::ImplicitDefine); } else { if (OnlyToVGPR) @@ -1598,3 +1602,57 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, llvm_unreachable("not implemented"); } } + +// Find reaching register definition +MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg, + MachineInstr &Use, + MachineRegisterInfo &MRI, + LiveIntervals *LIS) const { + auto &MDT = LIS->getAnalysis<MachineDominatorTree>(); + SlotIndex UseIdx = LIS->getInstructionIndex(Use); + SlotIndex DefIdx; + + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (!LIS->hasInterval(Reg)) + return nullptr; + LiveInterval &LI = LIS->getInterval(Reg); + LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg) + : MRI.getMaxLaneMaskForVReg(Reg); + VNInfo *V = nullptr; + if (LI.hasSubRanges()) { + for (auto &S : LI.subranges()) { + if ((S.LaneMask & SubLanes) == SubLanes) { + V = S.getVNInfoAt(UseIdx); + break; + } + } + } else { + V = LI.getVNInfoAt(UseIdx); + } + if (!V) + return nullptr; + DefIdx = V->def; + } else { + // Find last def. 
+ for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) { + LiveRange &LR = LIS->getRegUnit(*Units); + if (VNInfo *V = LR.getVNInfoAt(UseIdx)) { + if (!DefIdx.isValid() || + MDT.dominates(LIS->getInstructionFromIndex(DefIdx), + LIS->getInstructionFromIndex(V->def))) + DefIdx = V->def; + } else { + return nullptr; + } + } + } + + MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx); + + if (!Def || !MDT.dominates(Def, &Use)) + return nullptr; + + assert(Def->modifiesRegister(Reg, this)); + + return Def; +} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 5a51b67ca719c..b82fefde47e13 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -228,6 +228,12 @@ public: getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override; + // Find reaching register definition + MachineInstr *findReachingDef(unsigned Reg, unsigned SubReg, + MachineInstr &Use, + MachineRegisterInfo &MRI, + LiveIntervals *LIS) const; + private: void buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index f87a0763b353b..c625ecc9b750e 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -435,7 +435,7 @@ def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, let AllocationPriority = 7; } -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> { +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> { let CopyCost = 1; let AllocationPriority = 8; } @@ -444,13 +444,13 @@ def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add let isAllocatable = 0; } -def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32, +def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, 
i64, v2f32, f64, i1, v4i16, v4f16], 32, (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> { let CopyCost = 1; let AllocationPriority = 8; } -def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32, +def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32, (add SReg_64_XEXEC, EXEC)> { let CopyCost = 1; let AllocationPriority = 8; @@ -459,15 +459,15 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32, // Requires 2 s_mov_b64 to copy let CopyCost = 2 in { -def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> { +def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add SGPR_128Regs)> { let AllocationPriority = 10; } -def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> { +def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32, (add TTMP_128Regs)> { let isAllocatable = 0; } -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64, v2f64], 32, +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add SGPR_128, TTMP_128)> { let AllocationPriority = 10; } diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 4189bcce52ea1..6ad7dd0e3a7ce 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -64,59 +64,6 @@ FunctionPass *llvm::createSIShrinkInstructionsPass() { return new SIShrinkInstructions(); } -static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, - const SIRegisterInfo &TRI, - const MachineRegisterInfo &MRI) { - - const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); - // Can't shrink instruction with three operands. - // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add - // a special case for it. It can only be shrunk if the third operand - // is vcc. 
We should handle this the same way we handle vopc, by addding - // a register allocation hint pre-regalloc and then do the shrinking - // post-regalloc. - if (Src2) { - switch (MI.getOpcode()) { - default: return false; - - case AMDGPU::V_ADDC_U32_e64: - case AMDGPU::V_SUBB_U32_e64: - case AMDGPU::V_SUBBREV_U32_e64: { - const MachineOperand *Src1 - = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - if (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg())) - return false; - // Additional verification is needed for sdst/src2. - return true; - } - case AMDGPU::V_MAC_F32_e64: - case AMDGPU::V_MAC_F16_e64: - case AMDGPU::V_FMAC_F32_e64: - if (!Src2->isReg() || !TRI.isVGPR(MRI, Src2->getReg()) || - TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers)) - return false; - break; - - case AMDGPU::V_CNDMASK_B32_e64: - break; - } - } - - const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - if (Src1 && (!Src1->isReg() || !TRI.isVGPR(MRI, Src1->getReg()) || - TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) - return false; - - // We don't need to check src0, all input types are legal, so just make sure - // src0 isn't using any modifiers. - if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers)) - return false; - - // Check output modifiers - return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) && - !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp); -} - /// This function checks \p MI for operands defined by a move immediate /// instruction and then folds the literal constant into the instruction if it /// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions. @@ -173,19 +120,6 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, return false; } -// Copy MachineOperand with all flags except setting it as implicit. 
-static void copyFlagsToImplicitVCC(MachineInstr &MI, - const MachineOperand &Orig) { - - for (MachineOperand &Use : MI.implicit_operands()) { - if (Use.isUse() && Use.getReg() == AMDGPU::VCC) { - Use.setIsUndef(Orig.isUndef()); - Use.setIsKill(Orig.isKill()); - return; - } - } -} - static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { return isInt<16>(Src.getImm()) && !TII->isInlineConstant(*Src.getParent(), @@ -278,6 +212,245 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { } } +/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals. +/// For AND or OR, try using S_BITSET{0,1} to clear or set bits. +/// If the inverse of the immediate is legal, use ANDN2, ORN2 or +/// XNOR (as a ^ b == ~(a ^ ~b)). +/// \returns true if the caller should continue the machine function iterator +static bool shrinkScalarLogicOp(const GCNSubtarget &ST, + MachineRegisterInfo &MRI, + const SIInstrInfo *TII, + MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); + const MachineOperand *Dest = &MI.getOperand(0); + MachineOperand *Src0 = &MI.getOperand(1); + MachineOperand *Src1 = &MI.getOperand(2); + MachineOperand *SrcReg = Src0; + MachineOperand *SrcImm = Src1; + + if (SrcImm->isImm() && + !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) { + uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm()); + uint32_t NewImm = 0; + + if (Opc == AMDGPU::S_AND_B32) { + if (isPowerOf2_32(~Imm)) { + NewImm = countTrailingOnes(Imm); + Opc = AMDGPU::S_BITSET0_B32; + } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { + NewImm = ~Imm; + Opc = AMDGPU::S_ANDN2_B32; + } + } else if (Opc == AMDGPU::S_OR_B32) { + if (isPowerOf2_32(Imm)) { + NewImm = countTrailingZeros(Imm); + Opc = AMDGPU::S_BITSET1_B32; + } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { + NewImm = ~Imm; + Opc = AMDGPU::S_ORN2_B32; + } + } else if (Opc == AMDGPU::S_XOR_B32) { + if 
(AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) { + NewImm = ~Imm; + Opc = AMDGPU::S_XNOR_B32; + } + } else { + llvm_unreachable("unexpected opcode"); + } + + if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) && + SrcImm == Src0) { + if (!TII->commuteInstruction(MI, false, 1, 2)) + NewImm = 0; + } + + if (NewImm != 0) { + if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) && + SrcReg->isReg()) { + MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg()); + MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg()); + return true; + } + + if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) { + MI.setDesc(TII->get(Opc)); + if (Opc == AMDGPU::S_BITSET0_B32 || + Opc == AMDGPU::S_BITSET1_B32) { + Src0->ChangeToImmediate(NewImm); + MI.RemoveOperand(2); + } else { + SrcImm->setImm(NewImm); + } + } + } + } + + return false; +} + +// This is the same as MachineInstr::readsRegister/modifiesRegister except +// it takes subregs into account. +static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R, + unsigned Reg, unsigned SubReg, + const SIRegisterInfo &TRI) { + for (const MachineOperand &MO : R) { + if (!MO.isReg()) + continue; + + if (TargetRegisterInfo::isPhysicalRegister(Reg) && + TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + if (TRI.regsOverlap(Reg, MO.getReg())) + return true; + } else if (MO.getReg() == Reg && + TargetRegisterInfo::isVirtualRegister(Reg)) { + LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) & + TRI.getSubRegIndexLaneMask(MO.getSubReg()); + if (Overlap.any()) + return true; + } + } + return false; +} + +static bool instReadsReg(const MachineInstr *MI, + unsigned Reg, unsigned SubReg, + const SIRegisterInfo &TRI) { + return instAccessReg(MI->uses(), Reg, SubReg, TRI); +} + +static bool instModifiesReg(const MachineInstr *MI, + unsigned Reg, unsigned SubReg, + const SIRegisterInfo &TRI) { + return instAccessReg(MI->defs(), Reg, SubReg, TRI); +} + +static 
TargetInstrInfo::RegSubRegPair +getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I, + const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) { + if (TRI.getRegSizeInBits(Reg, MRI) != 32) { + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I)); + } else { + LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub); + Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger())); + } + } + return TargetInstrInfo::RegSubRegPair(Reg, Sub); +} + +// Match: +// mov t, x +// mov x, y +// mov y, t +// +// => +// +// mov t, x (t is potentially dead and move eliminated) +// v_swap_b32 x, y +// +// Returns next valid instruction pointer if was able to create v_swap_b32. +// +// This shall not be done too early not to prevent possible folding which may +// remove matched moves, and this should preferably be done before RA to +// release saved registers and also possibly after RA which can insert copies +// too. +// +// This is really just a generic peephole that is not a canonical shrinking, +// although requirements match the pass placement and it reduces code size too. 
+static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI, + const SIInstrInfo *TII) { + assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 || + MovT.getOpcode() == AMDGPU::COPY); + + unsigned T = MovT.getOperand(0).getReg(); + unsigned Tsub = MovT.getOperand(0).getSubReg(); + MachineOperand &Xop = MovT.getOperand(1); + + if (!Xop.isReg()) + return nullptr; + unsigned X = Xop.getReg(); + unsigned Xsub = Xop.getSubReg(); + + unsigned Size = TII->getOpSize(MovT, 0) / 4; + + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + if (!TRI.isVGPR(MRI, X)) + return nullptr; + + for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) { + if (YTop.getSubReg() != Tsub) + continue; + + MachineInstr &MovY = *YTop.getParent(); + if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 && + MovY.getOpcode() != AMDGPU::COPY) || + MovY.getOperand(1).getSubReg() != Tsub) + continue; + + unsigned Y = MovY.getOperand(0).getReg(); + unsigned Ysub = MovY.getOperand(0).getSubReg(); + + if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent()) + continue; + + MachineInstr *MovX = nullptr; + auto I = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end(); + for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) { + if (instReadsReg(&*I, X, Xsub, TRI) || + instModifiesReg(&*I, Y, Ysub, TRI) || + instModifiesReg(&*I, T, Tsub, TRI) || + (MovX && instModifiesReg(&*I, X, Xsub, TRI))) { + MovX = nullptr; + break; + } + if (!instReadsReg(&*I, Y, Ysub, TRI)) { + if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) { + MovX = nullptr; + break; + } + continue; + } + if (MovX || + (I->getOpcode() != AMDGPU::V_MOV_B32_e32 && + I->getOpcode() != AMDGPU::COPY) || + I->getOperand(0).getReg() != X || + I->getOperand(0).getSubReg() != Xsub) { + MovX = nullptr; + break; + } + MovX = &*I; + } + + if (!MovX || I == E) + continue; + + LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY); + + for (unsigned I = 0; I < Size; ++I) { + TargetInstrInfo::RegSubRegPair X1, Y1; 
+ X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI); + Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI); + BuildMI(*MovT.getParent(), MovX->getIterator(), MovT.getDebugLoc(), + TII->get(AMDGPU::V_SWAP_B32)) + .addDef(X1.Reg, 0, X1.SubReg) + .addDef(Y1.Reg, 0, Y1.SubReg) + .addReg(Y1.Reg, 0, Y1.SubReg) + .addReg(X1.Reg, 0, X1.SubReg).getInstr(); + } + MovX->eraseFromParent(); + MovY.eraseFromParent(); + MachineInstr *Next = &*std::next(MovT.getIterator()); + if (MRI.use_nodbg_empty(T)) + MovT.eraseFromParent(); + else + Xop.setIsKill(false); + + return Next; + } + + return nullptr; +} + bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -285,7 +458,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo &TRI = TII->getRegisterInfo(); std::vector<unsigned> I1Defs; @@ -319,6 +491,14 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { } } + if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 || + MI.getOpcode() == AMDGPU::COPY)) { + if (auto *NextMI = matchSwap(MI, MRI, TII)) { + Next = NextMI->getIterator(); + continue; + } + } + // Combine adjacent s_nops to use the immediate operand encoding how long // to wait. // @@ -408,14 +588,22 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; } + // Shrink scalar logic operations. + if (MI.getOpcode() == AMDGPU::S_AND_B32 || + MI.getOpcode() == AMDGPU::S_OR_B32 || + MI.getOpcode() == AMDGPU::S_XOR_B32) { + if (shrinkScalarLogicOp(ST, MRI, TII, MI)) + continue; + } + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) continue; - if (!canShrink(MI, TII, TRI, MRI)) { + if (!TII->canShrink(MI, MRI)) { // Try commuting the instruction and see if that enables us to shrink // it. 
if (!MI.isCommutable() || !TII->commuteInstruction(MI) || - !canShrink(MI, TII, TRI, MRI)) + !TII->canShrink(MI, MRI)) continue; } @@ -488,40 +676,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // We can shrink this instruction LLVM_DEBUG(dbgs() << "Shrinking " << MI); - MachineInstrBuilder Inst32 = - BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); - - // Add the dst operand if the 32-bit encoding also has an explicit $vdst. - // For VOPC instructions, this is replaced by an implicit def of vcc. - int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); - if (Op32DstIdx != -1) { - // dst - Inst32.add(MI.getOperand(0)); - } else { - assert(MI.getOperand(0).getReg() == AMDGPU::VCC && - "Unexpected case"); - } - - - Inst32.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); - - const MachineOperand *Src1 = - TII->getNamedOperand(MI, AMDGPU::OpName::src1); - if (Src1) - Inst32.add(*Src1); - - if (Src2) { - int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); - if (Op32Src2Idx != -1) { - Inst32.add(*Src2); - } else { - // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is - // replaced with an implicit read of vcc. This was already added - // during the initial BuildMI, so find it to preserve the flags. - copyFlagsToImplicitVCC(*Inst32, *Src2); - } - } - + MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32); ++NumInstructionsShrunk; // Copy extra operands not present in the instruction definition. 
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td index 7485326017b26..8a063e1a48673 100644 --- a/lib/Target/AMDGPU/SMInstructions.td +++ b/lib/Target/AMDGPU/SMInstructions.td @@ -375,83 +375,6 @@ defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards <"s_dcache_discard_x2">; } //===----------------------------------------------------------------------===// -// Scalar Memory Patterns -//===----------------------------------------------------------------------===// - - -def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ - auto Ld = cast<LoadSDNode>(N); - return Ld->getAlignment() >= 4 && - ((((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) || (Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT)) && !N->isDivergent()) || - (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && - !Ld->isVolatile() && !N->isDivergent() && - static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N))); -}]>; - -def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; -def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; -def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">; -def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; -def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; - -multiclass SMRD_Pattern <string Instr, ValueType vt> { - - // 1. IMM offset - def : GCNPat < - (smrd_load (SMRDImm i64:$sbase, i32:$offset)), - (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0)) - >; - - // 2. SGPR offset - def : GCNPat < - (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), - (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0)) - >; -} - -let OtherPredicates = [isSICI] in { -def : GCNPat < - (i64 (readcyclecounter)), - (S_MEMTIME) ->; -} - -// Global and constant loads can be selected to either MUBUF or SMRD -// instructions, but SMRD instructions are faster so we want the instruction -// selector to prefer those. 
-let AddedComplexity = 100 in { - -defm : SMRD_Pattern <"S_LOAD_DWORD", i32>; -defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>; -defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>; -defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; -defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; - -// 1. Offset as an immediate -def SM_LOAD_PATTERN : GCNPat < // name this pattern to reuse AddedComplexity on CI - (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)), - (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0) ->; - -// 2. Offset loaded in an 32bit SGPR -def : GCNPat < - (SIload_constant v4i32:$sbase, i32:$offset), - (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0) ->; - -} // End let AddedComplexity = 100 - -let OtherPredicates = [isVI] in { - -def : GCNPat < - (i64 (readcyclecounter)), - (S_MEMREALTIME) ->; - -} // let OtherPredicates = [isVI] - - -//===----------------------------------------------------------------------===// // Targets //===----------------------------------------------------------------------===// @@ -757,25 +680,97 @@ class SMRD_Real_ci <bits<5> op, SM_Pseudo ps> def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>; -let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in { +//===----------------------------------------------------------------------===// +// Scalar Memory Patterns +//===----------------------------------------------------------------------===// + +def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]>; + +def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; +def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; +def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">; +def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; +def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; + +multiclass SMRD_Pattern <string Instr, ValueType vt> { + + // 1. 
IMM offset + def : GCNPat < + (smrd_load (SMRDImm i64:$sbase, i32:$offset)), + (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0)) + >; + + // 2. 32-bit IMM offset on CI + def : GCNPat < + (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), + (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> { + let OtherPredicates = [isCIOnly]; + } + + // 3. SGPR offset + def : GCNPat < + (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0)) + >; +} + +multiclass SMLoad_Pattern <string Instr, ValueType vt> { + // 1. Offset as an immediate + def : GCNPat < + (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc), + (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc))) + >; + + // 2. 32-bit IMM offset on CI + def : GCNPat < + (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc)), + (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc))> { + let OtherPredicates = [isCIOnly]; + } -class SMRD_Pattern_ci <string Instr, ValueType vt> : GCNPat < - (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), - (vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> { - let OtherPredicates = [isCIOnly]; + // 3. Offset loaded in an 32bit SGPR + def : GCNPat < + (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc), + (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc))) + >; } -def : SMRD_Pattern_ci <"S_LOAD_DWORD", i32>; -def : SMRD_Pattern_ci <"S_LOAD_DWORDX2", v2i32>; -def : SMRD_Pattern_ci <"S_LOAD_DWORDX4", v4i32>; -def : SMRD_Pattern_ci <"S_LOAD_DWORDX8", v8i32>; -def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>; +// Global and constant loads can be selected to either MUBUF or SMRD +// instructions, but SMRD instructions are faster so we want the instruction +// selector to prefer those. 
+let AddedComplexity = 100 in { + +defm : SMRD_Pattern <"S_LOAD_DWORD", i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; + +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8i32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16i32>; + +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", f32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2f32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4f32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>; +defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>; +} // End let AddedComplexity = 100 +let OtherPredicates = [isSICI] in { def : GCNPat < - (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)), - (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> { - let OtherPredicates = [isCI]; // should this be isCIOnly? 
+ (i64 (readcyclecounter)), + (S_MEMTIME) +>; } -} // End let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity +let OtherPredicates = [isVI] in { +def : GCNPat < + (i64 (readcyclecounter)), + (S_MEMREALTIME) +>; + +} // let OtherPredicates = [isVI] diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index 6f5db9644c868..ca5e981ac5c25 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -336,42 +336,54 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < "$sdst, $src0, $src1", pattern >; +class UniformUnaryFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0), + (Op $src0), + [{ return !N->isDivergent(); }] +>; + +class UniformBinFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0, node:$src1), + (Op $src0, $src1), + [{ return !N->isDivergent(); }] +>; + let Defs = [SCC] in { // Carry out goes to SCC let isCommutable = 1 in { def S_ADD_U32 : SOP2_32 <"s_add_u32">; def S_ADD_I32 : SOP2_32 <"s_add_i32", - [(set i32:$sdst, (add SSrc_b32:$src0, SSrc_b32:$src1))] + [(set i32:$sdst, (UniformBinFrag<add> SSrc_b32:$src0, SSrc_b32:$src1))] >; } // End isCommutable = 1 def S_SUB_U32 : SOP2_32 <"s_sub_u32">; def S_SUB_I32 : SOP2_32 <"s_sub_i32", - [(set i32:$sdst, (sub SSrc_b32:$src0, SSrc_b32:$src1))] + [(set i32:$sdst, (UniformBinFrag<sub> SSrc_b32:$src0, SSrc_b32:$src1))] >; let Uses = [SCC] in { // Carry in comes from SCC let isCommutable = 1 in { def S_ADDC_U32 : SOP2_32 <"s_addc_u32", - [(set i32:$sdst, (adde (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>; + [(set i32:$sdst, (UniformBinFrag<adde> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>; } // End isCommutable = 1 def S_SUBB_U32 : SOP2_32 <"s_subb_u32", - [(set i32:$sdst, (sube (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>; + [(set i32:$sdst, (UniformBinFrag<sube> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>; } // End Uses = [SCC] let isCommutable = 1 in { def S_MIN_I32 : SOP2_32 <"s_min_i32", - [(set 
i32:$sdst, (smin i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<smin> i32:$src0, i32:$src1))] >; def S_MIN_U32 : SOP2_32 <"s_min_u32", - [(set i32:$sdst, (umin i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<umin> i32:$src0, i32:$src1))] >; def S_MAX_I32 : SOP2_32 <"s_max_i32", - [(set i32:$sdst, (smax i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<smax> i32:$src0, i32:$src1))] >; def S_MAX_U32 : SOP2_32 <"s_max_u32", - [(set i32:$sdst, (umax i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<umax> i32:$src0, i32:$src1))] >; } // End isCommutable = 1 } // End Defs = [SCC] @@ -385,27 +397,27 @@ let Uses = [SCC] in { let Defs = [SCC] in { let isCommutable = 1 in { def S_AND_B32 : SOP2_32 <"s_and_b32", - [(set i32:$sdst, (and i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, i32:$src1))] >; def S_AND_B64 : SOP2_64 <"s_and_b64", - [(set i64:$sdst, (and i64:$src0, i64:$src1))] + [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, i64:$src1))] >; def S_OR_B32 : SOP2_32 <"s_or_b32", - [(set i32:$sdst, (or i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, i32:$src1))] >; def S_OR_B64 : SOP2_64 <"s_or_b64", - [(set i64:$sdst, (or i64:$src0, i64:$src1))] + [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, i64:$src1))] >; def S_XOR_B32 : SOP2_32 <"s_xor_b32", - [(set i32:$sdst, (xor i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<xor> i32:$src0, i32:$src1))] >; def S_XOR_B64 : SOP2_64 <"s_xor_b64", - [(set i64:$sdst, (xor i64:$src0, i64:$src1))] + [(set i64:$sdst, (UniformBinFrag<xor> i64:$src0, i64:$src1))] >; def S_XNOR_B32 : SOP2_32 <"s_xnor_b32", @@ -415,45 +427,71 @@ def S_XNOR_B32 : SOP2_32 <"s_xnor_b32", def S_XNOR_B64 : SOP2_64 <"s_xnor_b64", [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))] >; + +def S_NAND_B32 : SOP2_32 <"s_nand_b32", + [(set i32:$sdst, (not (and_oneuse i32:$src0, i32:$src1)))] +>; + +def S_NAND_B64 : SOP2_64 <"s_nand_b64", + [(set i64:$sdst, 
(not (and_oneuse i64:$src0, i64:$src1)))] +>; + +def S_NOR_B32 : SOP2_32 <"s_nor_b32", + [(set i32:$sdst, (not (or_oneuse i32:$src0, i32:$src1)))] +>; + +def S_NOR_B64 : SOP2_64 <"s_nor_b64", + [(set i64:$sdst, (not (or_oneuse i64:$src0, i64:$src1)))] +>; } // End isCommutable = 1 -def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32">; -def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64">; -def S_ORN2_B32 : SOP2_32 <"s_orn2_b32">; -def S_ORN2_B64 : SOP2_64 <"s_orn2_b64">; -def S_NAND_B32 : SOP2_32 <"s_nand_b32">; -def S_NAND_B64 : SOP2_64 <"s_nand_b64">; -def S_NOR_B32 : SOP2_32 <"s_nor_b32">; -def S_NOR_B64 : SOP2_64 <"s_nor_b64">; +def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32", + [(set i32:$sdst, (UniformBinFrag<and> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))] +>; + +def S_ANDN2_B64 : SOP2_64 <"s_andn2_b64", + [(set i64:$sdst, (UniformBinFrag<and> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))] +>; + +def S_ORN2_B32 : SOP2_32 <"s_orn2_b32", + [(set i32:$sdst, (UniformBinFrag<or> i32:$src0, (UniformUnaryFrag<not> i32:$src1)))] +>; + +def S_ORN2_B64 : SOP2_64 <"s_orn2_b64", + [(set i64:$sdst, (UniformBinFrag<or> i64:$src0, (UniformUnaryFrag<not> i64:$src1)))] +>; } // End Defs = [SCC] // Use added complexity so these patterns are preferred to the VALU patterns. 
let AddedComplexity = 1 in { let Defs = [SCC] in { +// TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3 def S_LSHL_B32 : SOP2_32 <"s_lshl_b32", - [(set i32:$sdst, (shl i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<shl> i32:$src0, i32:$src1))] >; def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64", - [(set i64:$sdst, (shl i64:$src0, i32:$src1))] + [(set i64:$sdst, (UniformBinFrag<shl> i64:$src0, i32:$src1))] >; def S_LSHR_B32 : SOP2_32 <"s_lshr_b32", - [(set i32:$sdst, (srl i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<srl> i32:$src0, i32:$src1))] >; def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64", - [(set i64:$sdst, (srl i64:$src0, i32:$src1))] + [(set i64:$sdst, (UniformBinFrag<srl> i64:$src0, i32:$src1))] >; def S_ASHR_I32 : SOP2_32 <"s_ashr_i32", - [(set i32:$sdst, (sra i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<sra> i32:$src0, i32:$src1))] >; def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64", - [(set i64:$sdst, (sra i64:$src0, i32:$src1))] + [(set i64:$sdst, (UniformBinFrag<sra> i64:$src0, i32:$src1))] >; } // End Defs = [SCC] def S_BFM_B32 : SOP2_32 <"s_bfm_b32", - [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>; + [(set i32:$sdst, (UniformBinFrag<AMDGPUbfm> i32:$src0, i32:$src1))]>; def S_BFM_B64 : SOP2_64_32_32 <"s_bfm_b64">; + +// TODO: S_MUL_I32 require V_MUL_LO_I32 from VOP3 change def S_MUL_I32 : SOP2_32 <"s_mul_i32", [(set i32:$sdst, (mul i32:$src0, i32:$src1))]> { let isCommutable = 1; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 4eba193823154..54c866bdc63ce 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -128,131 +128,127 @@ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) { return NewInfo ? NewInfo->Opcode : -1; } -// Wrapper for Tablegen'd function. enum Subtarget is not defined in any -// header files, so we need to wrap it in a function that takes unsigned -// instead. 
-int getMCOpcode(uint16_t Opcode, unsigned Gen) { - return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen)); +struct MUBUFInfo { + uint16_t Opcode; + uint16_t BaseOpcode; + uint8_t dwords; + bool has_vaddr; + bool has_srsrc; + bool has_soffset; +}; + +#define GET_MUBUFInfoTable_DECL +#define GET_MUBUFInfoTable_IMPL +#include "AMDGPUGenSearchableTables.inc" + +int getMUBUFBaseOpcode(unsigned Opc) { + const MUBUFInfo *Info = getMUBUFInfoFromOpcode(Opc); + return Info ? Info->BaseOpcode : -1; } -namespace IsaInfo { +int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords) { + const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndDwords(BaseOpc, Dwords); + return Info ? Info->Opcode : -1; +} -IsaVersion getIsaVersion(const FeatureBitset &Features) { - // GCN GFX6 (Southern Islands (SI)). - if (Features.test(FeatureISAVersion6_0_0)) - return {6, 0, 0}; - if (Features.test(FeatureISAVersion6_0_1)) - return {6, 0, 1}; +int getMUBUFDwords(unsigned Opc) { + const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc); + return Info ? Info->dwords : 0; +} - // GCN GFX7 (Sea Islands (CI)). - if (Features.test(FeatureISAVersion7_0_0)) - return {7, 0, 0}; - if (Features.test(FeatureISAVersion7_0_1)) - return {7, 0, 1}; - if (Features.test(FeatureISAVersion7_0_2)) - return {7, 0, 2}; - if (Features.test(FeatureISAVersion7_0_3)) - return {7, 0, 3}; - if (Features.test(FeatureISAVersion7_0_4)) - return {7, 0, 4}; - if (Features.test(FeatureSeaIslands)) - return {7, 0, 0}; +bool getMUBUFHasVAddr(unsigned Opc) { + const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc); + return Info ? Info->has_vaddr : false; +} - // GCN GFX8 (Volcanic Islands (VI)). 
- if (Features.test(FeatureISAVersion8_0_1)) - return {8, 0, 1}; - if (Features.test(FeatureISAVersion8_0_2)) - return {8, 0, 2}; - if (Features.test(FeatureISAVersion8_0_3)) - return {8, 0, 3}; - if (Features.test(FeatureISAVersion8_1_0)) - return {8, 1, 0}; - if (Features.test(FeatureVolcanicIslands)) - return {8, 0, 0}; +bool getMUBUFHasSrsrc(unsigned Opc) { + const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc); + return Info ? Info->has_srsrc : false; +} - // GCN GFX9. - if (Features.test(FeatureISAVersion9_0_0)) - return {9, 0, 0}; - if (Features.test(FeatureISAVersion9_0_2)) - return {9, 0, 2}; - if (Features.test(FeatureISAVersion9_0_4)) - return {9, 0, 4}; - if (Features.test(FeatureISAVersion9_0_6)) - return {9, 0, 6}; - if (Features.test(FeatureGFX9)) - return {9, 0, 0}; +bool getMUBUFHasSoffset(unsigned Opc) { + const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc); + return Info ? Info->has_soffset : false; +} - if (Features.test(FeatureSouthernIslands)) - return {0, 0, 0}; - return {7, 0, 0}; +// Wrapper for Tablegen'd function. enum Subtarget is not defined in any +// header files, so we need to wrap it in a function that takes unsigned +// instead. 
+int getMCOpcode(uint16_t Opcode, unsigned Gen) { + return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen)); } +namespace IsaInfo { + void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) { auto TargetTriple = STI->getTargetTriple(); - auto ISAVersion = IsaInfo::getIsaVersion(STI->getFeatureBits()); + auto Version = getIsaVersion(STI->getCPU()); Stream << TargetTriple.getArchName() << '-' << TargetTriple.getVendorName() << '-' << TargetTriple.getOSName() << '-' << TargetTriple.getEnvironmentName() << '-' << "gfx" - << ISAVersion.Major - << ISAVersion.Minor - << ISAVersion.Stepping; + << Version.Major + << Version.Minor + << Version.Stepping; if (hasXNACK(*STI)) Stream << "+xnack"; + if (hasSRAMECC(*STI)) + Stream << "+sram-ecc"; Stream.flush(); } bool hasCodeObjectV3(const MCSubtargetInfo *STI) { - return STI->getFeatureBits().test(FeatureCodeObjectV3); + return STI->getTargetTriple().getOS() == Triple::AMDHSA && + STI->getFeatureBits().test(FeatureCodeObjectV3); } -unsigned getWavefrontSize(const FeatureBitset &Features) { - if (Features.test(FeatureWavefrontSize16)) +unsigned getWavefrontSize(const MCSubtargetInfo *STI) { + if (STI->getFeatureBits().test(FeatureWavefrontSize16)) return 16; - if (Features.test(FeatureWavefrontSize32)) + if (STI->getFeatureBits().test(FeatureWavefrontSize32)) return 32; return 64; } -unsigned getLocalMemorySize(const FeatureBitset &Features) { - if (Features.test(FeatureLocalMemorySize32768)) +unsigned getLocalMemorySize(const MCSubtargetInfo *STI) { + if (STI->getFeatureBits().test(FeatureLocalMemorySize32768)) return 32768; - if (Features.test(FeatureLocalMemorySize65536)) + if (STI->getFeatureBits().test(FeatureLocalMemorySize65536)) return 65536; return 0; } -unsigned getEUsPerCU(const FeatureBitset &Features) { +unsigned getEUsPerCU(const MCSubtargetInfo *STI) { return 4; } -unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features, +unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI, unsigned 
FlatWorkGroupSize) { - if (!Features.test(FeatureGCN)) + if (!STI->getFeatureBits().test(FeatureGCN)) return 8; - unsigned N = getWavesPerWorkGroup(Features, FlatWorkGroupSize); + unsigned N = getWavesPerWorkGroup(STI, FlatWorkGroupSize); if (N == 1) return 40; N = 40 / N; return std::min(N, 16u); } -unsigned getMaxWavesPerCU(const FeatureBitset &Features) { - return getMaxWavesPerEU() * getEUsPerCU(Features); +unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI) { + return getMaxWavesPerEU() * getEUsPerCU(STI); } -unsigned getMaxWavesPerCU(const FeatureBitset &Features, +unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize) { - return getWavesPerWorkGroup(Features, FlatWorkGroupSize); + return getWavesPerWorkGroup(STI, FlatWorkGroupSize); } -unsigned getMinWavesPerEU(const FeatureBitset &Features) { +unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) { return 1; } @@ -261,89 +257,89 @@ unsigned getMaxWavesPerEU() { return 10; } -unsigned getMaxWavesPerEU(const FeatureBitset &Features, +unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize) { - return alignTo(getMaxWavesPerCU(Features, FlatWorkGroupSize), - getEUsPerCU(Features)) / getEUsPerCU(Features); + return alignTo(getMaxWavesPerCU(STI, FlatWorkGroupSize), + getEUsPerCU(STI)) / getEUsPerCU(STI); } -unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features) { +unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI) { return 1; } -unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features) { +unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI) { return 2048; } -unsigned getWavesPerWorkGroup(const FeatureBitset &Features, +unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize) { - return alignTo(FlatWorkGroupSize, getWavefrontSize(Features)) / - getWavefrontSize(Features); + return alignTo(FlatWorkGroupSize, getWavefrontSize(STI)) / + getWavefrontSize(STI); } -unsigned getSGPRAllocGranule(const 
FeatureBitset &Features) { - IsaVersion Version = getIsaVersion(Features); +unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI) { + IsaVersion Version = getIsaVersion(STI->getCPU()); if (Version.Major >= 8) return 16; return 8; } -unsigned getSGPREncodingGranule(const FeatureBitset &Features) { +unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI) { return 8; } -unsigned getTotalNumSGPRs(const FeatureBitset &Features) { - IsaVersion Version = getIsaVersion(Features); +unsigned getTotalNumSGPRs(const MCSubtargetInfo *STI) { + IsaVersion Version = getIsaVersion(STI->getCPU()); if (Version.Major >= 8) return 800; return 512; } -unsigned getAddressableNumSGPRs(const FeatureBitset &Features) { - if (Features.test(FeatureSGPRInitBug)) +unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI) { + if (STI->getFeatureBits().test(FeatureSGPRInitBug)) return FIXED_NUM_SGPRS_FOR_INIT_BUG; - IsaVersion Version = getIsaVersion(Features); + IsaVersion Version = getIsaVersion(STI->getCPU()); if (Version.Major >= 8) return 102; return 104; } -unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { +unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { assert(WavesPerEU != 0); if (WavesPerEU >= getMaxWavesPerEU()) return 0; - unsigned MinNumSGPRs = getTotalNumSGPRs(Features) / (WavesPerEU + 1); - if (Features.test(FeatureTrapHandler)) + unsigned MinNumSGPRs = getTotalNumSGPRs(STI) / (WavesPerEU + 1); + if (STI->getFeatureBits().test(FeatureTrapHandler)) MinNumSGPRs -= std::min(MinNumSGPRs, (unsigned)TRAP_NUM_SGPRS); - MinNumSGPRs = alignDown(MinNumSGPRs, getSGPRAllocGranule(Features)) + 1; - return std::min(MinNumSGPRs, getAddressableNumSGPRs(Features)); + MinNumSGPRs = alignDown(MinNumSGPRs, getSGPRAllocGranule(STI)) + 1; + return std::min(MinNumSGPRs, getAddressableNumSGPRs(STI)); } -unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU, +unsigned getMaxNumSGPRs(const MCSubtargetInfo *STI, unsigned 
WavesPerEU, bool Addressable) { assert(WavesPerEU != 0); - IsaVersion Version = getIsaVersion(Features); - unsigned AddressableNumSGPRs = getAddressableNumSGPRs(Features); + IsaVersion Version = getIsaVersion(STI->getCPU()); + unsigned AddressableNumSGPRs = getAddressableNumSGPRs(STI); if (Version.Major >= 8 && !Addressable) AddressableNumSGPRs = 112; - unsigned MaxNumSGPRs = getTotalNumSGPRs(Features) / WavesPerEU; - if (Features.test(FeatureTrapHandler)) + unsigned MaxNumSGPRs = getTotalNumSGPRs(STI) / WavesPerEU; + if (STI->getFeatureBits().test(FeatureTrapHandler)) MaxNumSGPRs -= std::min(MaxNumSGPRs, (unsigned)TRAP_NUM_SGPRS); - MaxNumSGPRs = alignDown(MaxNumSGPRs, getSGPRAllocGranule(Features)); + MaxNumSGPRs = alignDown(MaxNumSGPRs, getSGPRAllocGranule(STI)); return std::min(MaxNumSGPRs, AddressableNumSGPRs); } -unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed, +unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed) { unsigned ExtraSGPRs = 0; if (VCCUsed) ExtraSGPRs = 2; - IsaVersion Version = getIsaVersion(Features); + IsaVersion Version = getIsaVersion(STI->getCPU()); if (Version.Major < 8) { if (FlatScrUsed) ExtraSGPRs = 4; @@ -358,74 +354,74 @@ unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed, return ExtraSGPRs; } -unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed, +unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed) { - return getNumExtraSGPRs(Features, VCCUsed, FlatScrUsed, - Features[AMDGPU::FeatureXNACK]); + return getNumExtraSGPRs(STI, VCCUsed, FlatScrUsed, + STI->getFeatureBits().test(AMDGPU::FeatureXNACK)); } -unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs) { - NumSGPRs = alignTo(std::max(1u, NumSGPRs), getSGPREncodingGranule(Features)); +unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) { + NumSGPRs = alignTo(std::max(1u, NumSGPRs), 
getSGPREncodingGranule(STI)); // SGPRBlocks is actual number of SGPR blocks minus 1. - return NumSGPRs / getSGPREncodingGranule(Features) - 1; + return NumSGPRs / getSGPREncodingGranule(STI) - 1; } -unsigned getVGPRAllocGranule(const FeatureBitset &Features) { +unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI) { return 4; } -unsigned getVGPREncodingGranule(const FeatureBitset &Features) { - return getVGPRAllocGranule(Features); +unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI) { + return getVGPRAllocGranule(STI); } -unsigned getTotalNumVGPRs(const FeatureBitset &Features) { +unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) { return 256; } -unsigned getAddressableNumVGPRs(const FeatureBitset &Features) { - return getTotalNumVGPRs(Features); +unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) { + return getTotalNumVGPRs(STI); } -unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { +unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { assert(WavesPerEU != 0); if (WavesPerEU >= getMaxWavesPerEU()) return 0; unsigned MinNumVGPRs = - alignDown(getTotalNumVGPRs(Features) / (WavesPerEU + 1), - getVGPRAllocGranule(Features)) + 1; - return std::min(MinNumVGPRs, getAddressableNumVGPRs(Features)); + alignDown(getTotalNumVGPRs(STI) / (WavesPerEU + 1), + getVGPRAllocGranule(STI)) + 1; + return std::min(MinNumVGPRs, getAddressableNumVGPRs(STI)); } -unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) { +unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) { assert(WavesPerEU != 0); - unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(Features) / WavesPerEU, - getVGPRAllocGranule(Features)); - unsigned AddressableNumVGPRs = getAddressableNumVGPRs(Features); + unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(STI) / WavesPerEU, + getVGPRAllocGranule(STI)); + unsigned AddressableNumVGPRs = getAddressableNumVGPRs(STI); return std::min(MaxNumVGPRs, 
AddressableNumVGPRs); } -unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumVGPRs) { - NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(Features)); +unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs) { + NumVGPRs = alignTo(std::max(1u, NumVGPRs), getVGPREncodingGranule(STI)); // VGPRBlocks is actual number of VGPR blocks minus 1. - return NumVGPRs / getVGPREncodingGranule(Features) - 1; + return NumVGPRs / getVGPREncodingGranule(STI) - 1; } } // end namespace IsaInfo void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, - const FeatureBitset &Features) { - IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(Features); + const MCSubtargetInfo *STI) { + IsaVersion Version = getIsaVersion(STI->getCPU()); memset(&Header, 0, sizeof(Header)); Header.amd_kernel_code_version_major = 1; Header.amd_kernel_code_version_minor = 2; Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU - Header.amd_machine_version_major = ISA.Major; - Header.amd_machine_version_minor = ISA.Minor; - Header.amd_machine_version_stepping = ISA.Stepping; + Header.amd_machine_version_major = Version.Major; + Header.amd_machine_version_minor = Version.Minor; + Header.amd_machine_version_stepping = Version.Stepping; Header.kernel_code_entry_byte_offset = sizeof(Header); // wavefront_size is specified as a power of 2: 2^6 = 64 threads. 
Header.wavefront_size = 6; @@ -513,7 +509,7 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F, return Ints; } -unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version) { +unsigned getVmcntBitMask(const IsaVersion &Version) { unsigned VmcntLo = (1 << getVmcntBitWidthLo()) - 1; if (Version.Major < 9) return VmcntLo; @@ -522,15 +518,15 @@ unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version) { return VmcntLo | VmcntHi; } -unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version) { +unsigned getExpcntBitMask(const IsaVersion &Version) { return (1 << getExpcntBitWidth()) - 1; } -unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version) { +unsigned getLgkmcntBitMask(const IsaVersion &Version) { return (1 << getLgkmcntBitWidth()) - 1; } -unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version) { +unsigned getWaitcntBitMask(const IsaVersion &Version) { unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo()); unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth()); unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth()); @@ -542,7 +538,7 @@ unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version) { return Waitcnt | VmcntHi; } -unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) { +unsigned decodeVmcnt(const IsaVersion &Version, unsigned Waitcnt) { unsigned VmcntLo = unpackBits(Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo()); if (Version.Major < 9) @@ -554,22 +550,30 @@ unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) { return VmcntLo | VmcntHi; } -unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) { +unsigned decodeExpcnt(const IsaVersion &Version, unsigned Waitcnt) { return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth()); } -unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) { +unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt) { return 
unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth()); } -void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, +void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt) { Vmcnt = decodeVmcnt(Version, Waitcnt); Expcnt = decodeExpcnt(Version, Waitcnt); Lgkmcnt = decodeLgkmcnt(Version, Waitcnt); } -unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, +Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded) { + Waitcnt Decoded; + Decoded.VmCnt = decodeVmcnt(Version, Encoded); + Decoded.ExpCnt = decodeExpcnt(Version, Encoded); + Decoded.LgkmCnt = decodeLgkmcnt(Version, Encoded); + return Decoded; +} + +unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Vmcnt) { Waitcnt = packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo()); @@ -580,17 +584,17 @@ unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, return packBits(Vmcnt, Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi()); } -unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, +unsigned encodeExpcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Expcnt) { return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth()); } -unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, +unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Lgkmcnt) { return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth()); } -unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version, +unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) { unsigned Waitcnt = getWaitcntBitMask(Version); Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt); @@ -599,6 +603,10 @@ unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version, return Waitcnt; } +unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) { + 
return encodeWaitcnt(Version, Decoded.VmCnt, Decoded.ExpCnt, Decoded.LgkmCnt); +} + unsigned getInitialPSInputAddr(const Function &F) { return getIntegerAttribute(F, "InitialPSInputAddr", 0); } @@ -643,6 +651,10 @@ bool hasXNACK(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureXNACK]; } +bool hasSRAMECC(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureSRAMECC]; +} + bool hasMIMG_R128(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128]; } @@ -798,6 +810,7 @@ unsigned getRegBitWidth(unsigned RCID) { case AMDGPU::VS_64RegClassID: case AMDGPU::SReg_64RegClassID: case AMDGPU::VReg_64RegClassID: + case AMDGPU::SReg_64_XEXECRegClassID: return 64; case AMDGPU::VReg_96RegClassID: return 96; @@ -935,27 +948,50 @@ bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset); } -} // end namespace AMDGPU - -} // end namespace llvm - -namespace llvm { -namespace AMDGPU { +// Given Imm, split it into the values to put into the SOffset and ImmOffset +// fields in an MUBUF instruction. Return false if it is not possible (due to a +// hardware bug needing a workaround). +// +// The required alignment ensures that individual address components remain +// aligned if they are aligned to begin with. It also ensures that additional +// offsets within the given alignment can be added to the resulting ImmOffset. 
+bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, + const GCNSubtarget *Subtarget, uint32_t Align) { + const uint32_t MaxImm = alignDown(4095, Align); + uint32_t Overflow = 0; -AMDGPUAS getAMDGPUAS(Triple T) { - AMDGPUAS AS; - AS.FLAT_ADDRESS = 0; - AS.PRIVATE_ADDRESS = 5; - AS.REGION_ADDRESS = 2; - return AS; -} + if (Imm > MaxImm) { + if (Imm <= MaxImm + 64) { + // Use an SOffset inline constant for 4..64 + Overflow = Imm - MaxImm; + Imm = MaxImm; + } else { + // Try to keep the same value in SOffset for adjacent loads, so that + // the corresponding register contents can be re-used. + // + // Load values with all low-bits (except for alignment bits) set into + // SOffset, so that a larger range of values can be covered using + // s_movk_i32. + // + // Atomic operations fail to work correctly when individual address + // components are unaligned, even if their sum is aligned. + uint32_t High = (Imm + Align) & ~4095; + uint32_t Low = (Imm + Align) & 4095; + Imm = Low; + Overflow = High - Align; + } + } -AMDGPUAS getAMDGPUAS(const TargetMachine &M) { - return getAMDGPUAS(M.getTargetTriple()); -} + // There is a hardware bug in SI and CI which prevents address clamping in + // MUBUF instructions from working correctly with SOffsets. The immediate + // offset is unaffected. 
+ if (Overflow > 0 && + Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) + return false; -AMDGPUAS getAMDGPUAS(const Module &M) { - return getAMDGPUAS(Triple(M.getTargetTriple())); + ImmOffset = Imm; + SOffset = Overflow; + return true; } namespace { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 5b7af8268cdaf..20123ed4ac815 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -19,6 +19,7 @@ #include "llvm/Support/AMDHSAKernelDescriptor.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetParser.h" #include <cstdint> #include <string> #include <utility> @@ -26,8 +27,10 @@ namespace llvm { class Argument; +class AMDGPUSubtarget; class FeatureBitset; class Function; +class GCNSubtarget; class GlobalValue; class MCContext; class MCRegisterClass; @@ -54,16 +57,6 @@ enum { TRAP_NUM_SGPRS = 16 }; -/// Instruction set architecture version. -struct IsaVersion { - unsigned Major; - unsigned Minor; - unsigned Stepping; -}; - -/// \returns Isa version for given subtarget \p Features. -IsaVersion getIsaVersion(const FeatureBitset &Features); - /// Streams isa version string for given subtarget \p STI into \p Stream. void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream); @@ -71,114 +64,114 @@ void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream); /// false otherwise. bool hasCodeObjectV3(const MCSubtargetInfo *STI); -/// \returns Wavefront size for given subtarget \p Features. -unsigned getWavefrontSize(const FeatureBitset &Features); +/// \returns Wavefront size for given subtarget \p STI. +unsigned getWavefrontSize(const MCSubtargetInfo *STI); -/// \returns Local memory size in bytes for given subtarget \p Features. -unsigned getLocalMemorySize(const FeatureBitset &Features); +/// \returns Local memory size in bytes for given subtarget \p STI. 
+unsigned getLocalMemorySize(const MCSubtargetInfo *STI); /// \returns Number of execution units per compute unit for given subtarget \p -/// Features. -unsigned getEUsPerCU(const FeatureBitset &Features); +/// STI. +unsigned getEUsPerCU(const MCSubtargetInfo *STI); /// \returns Maximum number of work groups per compute unit for given subtarget -/// \p Features and limited by given \p FlatWorkGroupSize. -unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features, +/// \p STI and limited by given \p FlatWorkGroupSize. +unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize); /// \returns Maximum number of waves per compute unit for given subtarget \p -/// Features without any kind of limitation. -unsigned getMaxWavesPerCU(const FeatureBitset &Features); +/// STI without any kind of limitation. +unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI); /// \returns Maximum number of waves per compute unit for given subtarget \p -/// Features and limited by given \p FlatWorkGroupSize. -unsigned getMaxWavesPerCU(const FeatureBitset &Features, +/// STI and limited by given \p FlatWorkGroupSize. +unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize); /// \returns Minimum number of waves per execution unit for given subtarget \p -/// Features. -unsigned getMinWavesPerEU(const FeatureBitset &Features); +/// STI. +unsigned getMinWavesPerEU(const MCSubtargetInfo *STI); /// \returns Maximum number of waves per execution unit for given subtarget \p -/// Features without any kind of limitation. +/// STI without any kind of limitation. unsigned getMaxWavesPerEU(); /// \returns Maximum number of waves per execution unit for given subtarget \p -/// Features and limited by given \p FlatWorkGroupSize. -unsigned getMaxWavesPerEU(const FeatureBitset &Features, +/// STI and limited by given \p FlatWorkGroupSize. 
+unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize); -/// \returns Minimum flat work group size for given subtarget \p Features. -unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features); +/// \returns Minimum flat work group size for given subtarget \p STI. +unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI); -/// \returns Maximum flat work group size for given subtarget \p Features. -unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features); +/// \returns Maximum flat work group size for given subtarget \p STI. +unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI); -/// \returns Number of waves per work group for given subtarget \p Features and +/// \returns Number of waves per work group for given subtarget \p STI and /// limited by given \p FlatWorkGroupSize. -unsigned getWavesPerWorkGroup(const FeatureBitset &Features, +unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize); -/// \returns SGPR allocation granularity for given subtarget \p Features. -unsigned getSGPRAllocGranule(const FeatureBitset &Features); +/// \returns SGPR allocation granularity for given subtarget \p STI. +unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI); -/// \returns SGPR encoding granularity for given subtarget \p Features. -unsigned getSGPREncodingGranule(const FeatureBitset &Features); +/// \returns SGPR encoding granularity for given subtarget \p STI. +unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI); -/// \returns Total number of SGPRs for given subtarget \p Features. -unsigned getTotalNumSGPRs(const FeatureBitset &Features); +/// \returns Total number of SGPRs for given subtarget \p STI. +unsigned getTotalNumSGPRs(const MCSubtargetInfo *STI); -/// \returns Addressable number of SGPRs for given subtarget \p Features. -unsigned getAddressableNumSGPRs(const FeatureBitset &Features); +/// \returns Addressable number of SGPRs for given subtarget \p STI. 
+unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI); /// \returns Minimum number of SGPRs that meets the given number of waves per -/// execution unit requirement for given subtarget \p Features. -unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU); +/// execution unit requirement for given subtarget \p STI. +unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU); /// \returns Maximum number of SGPRs that meets the given number of waves per -/// execution unit requirement for given subtarget \p Features. -unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU, +/// execution unit requirement for given subtarget \p STI. +unsigned getMaxNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, bool Addressable); /// \returns Number of extra SGPRs implicitly required by given subtarget \p -/// Features when the given special registers are used. -unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed, +/// STI when the given special registers are used. +unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed); /// \returns Number of extra SGPRs implicitly required by given subtarget \p -/// Features when the given special registers are used. XNACK is inferred from -/// \p Features. -unsigned getNumExtraSGPRs(const FeatureBitset &Features, bool VCCUsed, +/// STI when the given special registers are used. XNACK is inferred from +/// \p STI. +unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed); -/// \returns Number of SGPR blocks needed for given subtarget \p Features when +/// \returns Number of SGPR blocks needed for given subtarget \p STI when /// \p NumSGPRs are used. \p NumSGPRs should already include any special /// register counts. 
-unsigned getNumSGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs); +unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs); -/// \returns VGPR allocation granularity for given subtarget \p Features. -unsigned getVGPRAllocGranule(const FeatureBitset &Features); +/// \returns VGPR allocation granularity for given subtarget \p STI. +unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI); -/// \returns VGPR encoding granularity for given subtarget \p Features. -unsigned getVGPREncodingGranule(const FeatureBitset &Features); +/// \returns VGPR encoding granularity for given subtarget \p STI. +unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI); -/// \returns Total number of VGPRs for given subtarget \p Features. -unsigned getTotalNumVGPRs(const FeatureBitset &Features); +/// \returns Total number of VGPRs for given subtarget \p STI. +unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI); -/// \returns Addressable number of VGPRs for given subtarget \p Features. -unsigned getAddressableNumVGPRs(const FeatureBitset &Features); +/// \returns Addressable number of VGPRs for given subtarget \p STI. +unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI); /// \returns Minimum number of VGPRs that meets given number of waves per -/// execution unit requirement for given subtarget \p Features. -unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU); +/// execution unit requirement for given subtarget \p STI. +unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU); /// \returns Maximum number of VGPRs that meets given number of waves per -/// execution unit requirement for given subtarget \p Features. -unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU); +/// execution unit requirement for given subtarget \p STI. 
+unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU); -/// \returns Number of VGPR blocks needed for given subtarget \p Features when +/// \returns Number of VGPR blocks needed for given subtarget \p STI when /// \p NumVGPRs are used. -unsigned getNumVGPRBlocks(const FeatureBitset &Features, unsigned NumSGPRs); +unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs); } // end namespace IsaInfo @@ -191,6 +184,7 @@ struct MIMGBaseOpcodeInfo { bool Atomic; bool AtomicX2; bool Sampler; + bool Gather4; uint8_t NumExtraArgs; bool Gradients; @@ -228,10 +222,28 @@ LLVM_READONLY int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels); LLVM_READONLY +int getMUBUFBaseOpcode(unsigned Opc); + +LLVM_READONLY +int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords); + +LLVM_READONLY +int getMUBUFDwords(unsigned Opc); + +LLVM_READONLY +bool getMUBUFHasVAddr(unsigned Opc); + +LLVM_READONLY +bool getMUBUFHasSrsrc(unsigned Opc); + +LLVM_READONLY +bool getMUBUFHasSoffset(unsigned Opc); + +LLVM_READONLY int getMCOpcode(uint16_t Opcode, unsigned Gen); void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, - const FeatureBitset &Features); + const MCSubtargetInfo *STI); amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(); @@ -265,26 +277,52 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F, std::pair<int, int> Default, bool OnlyFirstRequired = false); +/// Represents the counter values to wait for in an s_waitcnt instruction. +/// +/// Large values (including the maximum possible integer) can be used to +/// represent "don't care" waits. 
+struct Waitcnt { + unsigned VmCnt = ~0u; + unsigned ExpCnt = ~0u; + unsigned LgkmCnt = ~0u; + + Waitcnt() {} + Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt) + : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt) {} + + static Waitcnt allZero() { return Waitcnt(0, 0, 0); } + + bool dominates(const Waitcnt &Other) const { + return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt && + LgkmCnt <= Other.LgkmCnt; + } + + Waitcnt combined(const Waitcnt &Other) const { + return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt), + std::min(LgkmCnt, Other.LgkmCnt)); + } +}; + /// \returns Vmcnt bit mask for given isa \p Version. -unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version); +unsigned getVmcntBitMask(const IsaVersion &Version); /// \returns Expcnt bit mask for given isa \p Version. -unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version); +unsigned getExpcntBitMask(const IsaVersion &Version); /// \returns Lgkmcnt bit mask for given isa \p Version. -unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version); +unsigned getLgkmcntBitMask(const IsaVersion &Version); /// \returns Waitcnt bit mask for given isa \p Version. -unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version); +unsigned getWaitcntBitMask(const IsaVersion &Version); /// \returns Decoded Vmcnt from given \p Waitcnt for given isa \p Version. -unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); +unsigned decodeVmcnt(const IsaVersion &Version, unsigned Waitcnt); /// \returns Decoded Expcnt from given \p Waitcnt for given isa \p Version. -unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); +unsigned decodeExpcnt(const IsaVersion &Version, unsigned Waitcnt); /// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version. 
-unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); +unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt); /// Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa /// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and @@ -295,19 +333,21 @@ unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt); /// \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14] (gfx9+ only) /// \p Expcnt = \p Waitcnt[6:4] /// \p Lgkmcnt = \p Waitcnt[11:8] -void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, +void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt); +Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded); + /// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version. -unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, +unsigned encodeVmcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Vmcnt); /// \returns \p Waitcnt with encoded \p Expcnt for given isa \p Version. -unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, +unsigned encodeExpcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Expcnt); /// \returns \p Waitcnt with encoded \p Lgkmcnt for given isa \p Version. -unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, +unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned Lgkmcnt); /// Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa @@ -322,9 +362,11 @@ unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt, /// /// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given /// isa \p Version. 
-unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version, +unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt); +unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded); + unsigned getInitialPSInputAddr(const Function &F); LLVM_READNONE @@ -349,6 +391,7 @@ inline bool isKernel(CallingConv::ID CC) { } bool hasXNACK(const MCSubtargetInfo &STI); +bool hasSRAMECC(const MCSubtargetInfo &STI); bool hasMIMG_R128(const MCSubtargetInfo &STI); bool hasPackedD16(const MCSubtargetInfo &STI); @@ -447,6 +490,9 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); /// not the encoded offset. bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset); +bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, + const GCNSubtarget *Subtarget, uint32_t Align = 4); + /// \returns true if the intrinsic is divergent bool isIntrinsicSourceOfDivergence(unsigned IntrID); diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp deleted file mode 100644 index 1924f71f11c84..0000000000000 --- a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp +++ /dev/null @@ -1,75 +0,0 @@ -//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// MBB A lane-dominates MBB B if -// 1. A dominates B in the usual sense, i.e. every path from the entry to B -// goes through A, and -// 2. whenever B executes, every active lane during that execution of B was -// also active during the most recent execution of A. 
-// -// The simplest example where A dominates B but does not lane-dominate it is -// where A is a loop: -// -// | -// +--+ -// A | -// +--+ -// | -// B -// -// Unfortunately, the second condition is not fully captured by the control -// flow graph when it is unstructured (as may happen when branch conditions are -// uniform). -// -// The following replacement of the second condition is a conservative -// approximation. It is an equivalent condition when the CFG is fully -// structured: -// -// 2'. every cycle in the CFG that contains A also contains B. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPULaneDominator.h" - -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineBasicBlock.h" - -namespace llvm { - -namespace AMDGPU { - -// Given machine basic blocks A and B where A dominates B, check whether -// A lane-dominates B. -// -// The check is conservative, i.e. there can be false-negatives. -bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) { - // Check whether A is reachable from itself without going through B. 
- DenseSet<MachineBasicBlock *> Reachable; - SmallVector<MachineBasicBlock *, 8> Stack; - - Stack.push_back(A); - do { - MachineBasicBlock *MBB = Stack.back(); - Stack.pop_back(); - - for (MachineBasicBlock *Succ : MBB->successors()) { - if (Succ == A) - return false; - if (Succ != B && Reachable.insert(Succ).second) - Stack.push_back(Succ); - } - } while (!Stack.empty()); - - return true; -} - -} // namespace AMDGPU - -} // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h deleted file mode 100644 index 4f33a89a364bd..0000000000000 --- a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h +++ /dev/null @@ -1,24 +0,0 @@ -//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H -#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H - -namespace llvm { - -class MachineBasicBlock; - -namespace AMDGPU { - -bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB); - -} // end namespace AMDGPU -} // end namespace llvm - -#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h index 9f0a4d29b5e43..82ffdef8e674a 100644 --- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h @@ -46,6 +46,7 @@ int64_t Value = 0; \ if (!expectAbsExpression(MCParser, Value, Err)) \ return false; \ + C.compute_pgm_resource_registers &= ~(SetMacro(0xFFFFFFFFFFFFFFFFULL) << Shift); \ C.compute_pgm_resource_registers |= SetMacro(Value) << Shift; \ return true; \ } diff --git a/lib/Target/AMDGPU/Utils/CMakeLists.txt 
b/lib/Target/AMDGPU/Utils/CMakeLists.txt index c5ed32e46821b..01b80ebe8d3dc 100644 --- a/lib/Target/AMDGPU/Utils/CMakeLists.txt +++ b/lib/Target/AMDGPU/Utils/CMakeLists.txt @@ -2,5 +2,4 @@ add_llvm_library(LLVMAMDGPUUtils AMDGPUBaseInfo.cpp AMDKernelCodeTUtils.cpp AMDGPUAsmUtils.cpp - AMDGPULaneDominator.cpp ) diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index 4c7a92219755b..68446ab79720a 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -84,6 +84,10 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP1"; } +class VOP1_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : + VOP_DPP_Pseudo <OpName, P, pattern> { +} + class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, @@ -103,6 +107,8 @@ multiclass VOP1Inst <string opName, VOPProfile P, def _e32 : VOP1_Pseudo <opName, P>; def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>; def _sdwa : VOP1_SDWA_Pseudo <opName, P>; + foreach _ = BoolToList<P.HasExtDPP>.ret in + def _dpp : VOP1_DPP_Pseudo <opName, P>; } // Special profile for instructions which have clamp @@ -173,7 +179,9 @@ defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>; defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>; defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>; defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>; +let FPDPRounding = 1 in { defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>; +} // End FPDPRounding = 1 defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>; defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>; defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>; @@ -226,7 +234,9 @@ defm V_FFBH_I32 : VOP1Inst 
<"v_ffbh_i32", VOP_I32_I32>; let SchedRW = [WriteDoubleAdd] in { defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>; defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>; +let FPDPRounding = 1 in { defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>; +} // End FPDPRounding = 1 } // End SchedRW = [WriteDoubleAdd] defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>; @@ -242,7 +252,9 @@ def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> { let Src0RC64 = VRegSrc_32; let HasExt = 0; - let HasSDWA9 = 0; + let HasExtDPP = 0; + let HasExtSDWA = 0; + let HasExtSDWA9 = 0; } // Special case because there are no true output operands. Hack vdst @@ -271,7 +283,10 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret; let HasExt = 0; - let HasSDWA9 = 0; + let HasExtDPP = 0; + let HasExtSDWA = 0; + let HasExtSDWA9 = 0; + let HasDst = 0; let EmitDst = 1; // force vdst emission } @@ -328,8 +343,10 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>; let SubtargetPredicate = Has16BitInsts in { +let FPDPRounding = 1 in { defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>; defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>; +} // End FPDPRounding = 1 defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>; defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>; let SchedRW = [WriteQuarterRate32] in { @@ -347,7 +364,9 @@ defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>; defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>; defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>; defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>; +let FPDPRounding = 1 in { defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>; +} // End 
FPDPRounding = 1 } @@ -495,13 +514,8 @@ defm V_EXP_LEGACY_F32 : VOP1_Real_ci <0x46>; // VI //===----------------------------------------------------------------------===// -class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> : - VOP_DPP <ps.OpName, P> { - let Defs = ps.Defs; - let Uses = ps.Uses; - let SchedRW = ps.SchedRW; - let hasSideEffects = ps.hasSideEffects; - +class VOP1_DPPe <bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : + VOP_DPPe <P> { bits<8> vdst; let Inst{8-0} = 0xfa; // dpp let Inst{16-9} = op; @@ -539,9 +553,10 @@ multiclass VOP1_Real_vi <bits<10> op> { VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; - // For now left dpp only for asm/dasm - // TODO: add corresponding pseudo - def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>; + foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_vi : + VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>, + VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>; } defm V_NOP : VOP1_Real_vi <0x0>; @@ -712,9 +727,11 @@ multiclass VOP1_Real_gfx9 <bits<10> op> { VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>, VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; - // For now left dpp only for asm/dasm - // TODO: add corresponding pseudo - def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>; + foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx9 : + VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, + VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>; + } defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>; diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 5ec1a15c5cd20..e3fd7b5f9fadd 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -105,6 +105,11 @@ class 
VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP2"; } +class VOP2_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : + VOP_DPP_Pseudo <OpName, P, pattern> { +} + + class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, @@ -116,22 +121,49 @@ class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]); } -multiclass VOP2Inst <string opName, - VOPProfile P, - SDPatternOperator node = null_frag, - string revOp = opName, - bit GFX9Renamed = 0> { - +multiclass VOP2Inst_e32<string opName, + VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName, + bit GFX9Renamed = 0> { let renamedInGFX9 = GFX9Renamed in { - - def _e32 : VOP2_Pseudo <opName, P>, + def _e32 : VOP2_Pseudo <opName, P, VOPPatOrNull<node,P>.ret>, Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; + } // End renamedInGFX9 = GFX9Renamed +} +multiclass VOP2Inst_e64<string opName, + VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName, + bit GFX9Renamed = 0> { + let renamedInGFX9 = GFX9Renamed in { def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; + } // End renamedInGFX9 = GFX9Renamed +} - def _sdwa : VOP2_SDWA_Pseudo <opName, P>; +multiclass VOP2Inst_sdwa<string opName, + VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName, + bit GFX9Renamed = 0> { + let renamedInGFX9 = GFX9Renamed in { + def _sdwa : VOP2_SDWA_Pseudo <opName, P>; + } // End renamedInGFX9 = GFX9Renamed +} +multiclass VOP2Inst<string opName, + VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName, + bit GFX9Renamed = 0> : + VOP2Inst_e32<opName, P, node, revOp, GFX9Renamed>, + VOP2Inst_e64<opName, P, node, revOp, GFX9Renamed>, + VOP2Inst_sdwa<opName, P, node, revOp, GFX9Renamed> { 
+ let renamedInGFX9 = GFX9Renamed in { + foreach _ = BoolToList<P.HasExtDPP>.ret in + def _dpp : VOP2_DPP_Pseudo <opName, P>; } } @@ -144,12 +176,14 @@ multiclass VOP2bInst <string opName, let renamedInGFX9 = GFX9Renamed in { let SchedRW = [Write32Bit, WriteSALU] in { let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in { - def _e32 : VOP2_Pseudo <opName, P>, + def _e32 : VOP2_Pseudo <opName, P, VOPPatOrNull<node,P>.ret>, Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; def _sdwa : VOP2_SDWA_Pseudo <opName, P> { let AsmMatchConverter = "cvtSdwaVOP2b"; } + foreach _ = BoolToList<P.HasExtDPP>.ret in + def _dpp : VOP2_DPP_Pseudo <opName, P>; } def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, @@ -172,6 +206,9 @@ multiclass VOP2eInst <string opName, def _sdwa : VOP2_SDWA_Pseudo <opName, P> { let AsmMatchConverter = "cvtSdwaVOP2b"; } + + foreach _ = BoolToList<P.HasExtDPP>.ret in + def _dpp : VOP2_DPP_Pseudo <opName, P>; } def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, @@ -211,9 +248,9 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, 0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; - let InsDPP = (ins DstRCDPP:$old, - Src0ModDPP:$src0_modifiers, Src0DPP:$src0, + let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, + VGPR_32:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); @@ -230,21 +267,15 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret; let HasSrc2 = 0; let HasSrc2Mods = 0; - let HasExt = 1; - let HasSDWA9 = 0; -} -def VOP_MAC_F16 : VOP_MAC <f16> { - // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives - // 'not a string initializer' error. 
- let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, f16>.ret; + let HasExt = 1; + let HasExtDPP = 1; + let HasExtSDWA = 1; + let HasExtSDWA9 = 0; } -def VOP_MAC_F32 : VOP_MAC <f32> { - // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives - // 'not a string initializer' error. - let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, f32>.ret; -} +def VOP_MAC_F16 : VOP_MAC <f16>; +def VOP_MAC_F32 : VOP_MAC <f32>; // Write out to vcc or arbitrary SGPR. def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { @@ -290,7 +321,9 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let HasExt = 1; - let HasSDWA9 = 1; + let HasExtDPP = 1; + let HasExtSDWA = 1; + let HasExtSDWA9 = 1; } // Read in from vcc or arbitrary SGPR @@ -321,7 +354,9 @@ def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let HasExt = 1; - let HasSDWA9 = 1; + let HasExtDPP = 1; + let HasExtSDWA = 1; + let HasExtSDWA9 = 1; } def VOP_READLANE : VOPProfile<[i32, i32, i32]> { @@ -331,8 +366,11 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> { let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; + let HasExt = 0; - let HasSDWA9 = 0; + let HasExtDPP = 0; + let HasExtSDWA = 0; + let HasExtSDWA9 = 0; } def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> { @@ -342,20 +380,23 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> { let Ins64 = Ins32; let Asm32 = " $vdst, $src0, $src1"; let Asm64 = Asm32; - let HasExt = 0; - let HasSDWA9 = 0; let HasSrc2 = 0; let HasSrc2Mods = 0; + + let HasExt = 0; + let HasExtDPP = 0; + let HasExtSDWA = 0; + let HasExtSDWA9 = 0; } //===----------------------------------------------------------------------===// // VOP2 Instructions //===----------------------------------------------------------------------===// -let 
SubtargetPredicate = isGCN in { +let SubtargetPredicate = isGCN, Predicates = [isGCN] in { defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>; -def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, [], "">; +def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>; let isCommutable = 1 in { defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>; @@ -363,29 +404,29 @@ defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, fsub>; defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">; defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>; defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, fmul>; -defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24>; -defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_I32_I32_I32, AMDGPUmulhi_i24>; -defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24>; -defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>; -defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>; -defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>; -defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_I32_I32_I32>; -defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_I32_I32_I32>; -defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_I32_I32_I32>; -defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_I32_I32_I32>; +defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmul_i24>; +defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_i24>; +defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmul_u24>; +defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_u24>; +defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>; +defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>; +defm 
V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>; +defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>; +defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>; +defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umax>; defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32">; defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32">; defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32">; -defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_I32_I32_I32>; -defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_I32_I32_I32>; -defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_I32_I32_I32>; +defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>; +defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>; +defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>; let Constraints = "$vdst = $src2", DisableEncoding="$src2", isConvertibleToThreeAddress = 1 in { defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>; } -def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">; +def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>; // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. @@ -411,11 +452,11 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32, null_frag, "v_sub // These are special and do not read the exec mask. 
let isConvergent = 1, Uses = []<Register> in { def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, - [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">; + [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>; let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, - [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))], "">; + [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>; } // End $vdst = $vdst_in, DisableEncoding $vdst_in } // End isConvergent = 1 @@ -425,13 +466,13 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32 defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>; defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>; defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst" -defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_i16_f32>; -defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_u16_f32>; -defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>; -defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_u16_u32>; -defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_i16_i32>; +defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_i16_f32>; +defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_u16_f32>; +defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_V2F16_F32_F32>, AMDGPUpkrtz_f16_f32>; +defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", 
VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_u16_u32>; +defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_i16_i32>; -} // End SubtargetPredicate = isGCN +} // End SubtargetPredicate = isGCN, Predicates = [isGCN] def : GCNPat< (AMDGPUadde i32:$src0, i32:$src1, i1:$src2), @@ -444,40 +485,99 @@ def : GCNPat< >; // These instructions only exist on SI and CI -let SubtargetPredicate = isSICI in { +let SubtargetPredicate = isSICI, Predicates = [isSICI] in { defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy>; defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>; let isCommutable = 1 in { defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>; -defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>; -defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>; -defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>; +defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>; +defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>; +defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>; } // End isCommutable = 1 -} // End let SubtargetPredicate = SICI +} // End let SubtargetPredicate = SICI, Predicates = [isSICI] + +class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> : + GCNPat< + (getDivergentFrag<Op>.ret Inst.Pfl.Src0VT:$src0, Inst.Pfl.Src1VT:$src1), + !if(!cast<Commutable_REV>(Inst).IsOrig, + (Inst $src0, $src1), + (Inst $src1, $src0) + ) + >; + +let AddedComplexity = 1 in { + def : DivergentBinOp<srl, V_LSHRREV_B32_e64>; + def : DivergentBinOp<sra, V_ASHRREV_I32_e64>; + def : DivergentBinOp<shl, V_LSHLREV_B32_e64>; +} + +let SubtargetPredicate = HasAddNoCarryInsts in { + def : DivergentBinOp<add, V_ADD_U32_e32>; + def : DivergentBinOp<sub, V_SUB_U32_e32>; + def : DivergentBinOp<sub, V_SUBREV_U32_e32>; +} + + +def : DivergentBinOp<add, V_ADD_I32_e32>; + +def : 
DivergentBinOp<add, V_ADD_I32_e64>; +def : DivergentBinOp<sub, V_SUB_I32_e32>; + +def : DivergentBinOp<sub, V_SUBREV_I32_e32>; + +def : DivergentBinOp<srl, V_LSHRREV_B32_e32>; +def : DivergentBinOp<sra, V_ASHRREV_I32_e32>; +def : DivergentBinOp<shl, V_LSHLREV_B32_e32>; +def : DivergentBinOp<adde, V_ADDC_U32_e32>; +def : DivergentBinOp<sube, V_SUBB_U32_e32>; + +class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> : + GCNPat< + (getDivergentFrag<Op>.ret i64:$src0, i64:$src1), + (REG_SEQUENCE VReg_64, + (Inst + (i32 (EXTRACT_SUBREG $src0, sub0)), + (i32 (EXTRACT_SUBREG $src1, sub0)) + ), sub0, + (Inst + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1)) + ), sub1 + ) + >; + +def : divergent_i64_BinOp <and, V_AND_B32_e32>; +def : divergent_i64_BinOp <or, V_OR_B32_e32>; +def : divergent_i64_BinOp <xor, V_XOR_B32_e32>; let SubtargetPredicate = Has16BitInsts in { +let FPDPRounding = 1 in { def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; +defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; +} // End FPDPRounding = 1 + defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>; defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>; -defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>; let isCommutable = 1 in { +let FPDPRounding = 1 in { defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>; defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>; defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">; defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>; def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">; +} // End FPDPRounding = 1 defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>; defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, 
null_frag, "v_sub_u16">; defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>; -defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>; -defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>; +defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>; +defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>; defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>; defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>; defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>; @@ -698,13 +798,8 @@ defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>; // VI //===----------------------------------------------------------------------===// -class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfile P = ps.Pfl> : - VOP_DPP <OpName, P> { - let Defs = ps.Defs; - let Uses = ps.Uses; - let SchedRW = ps.SchedRW; - let hasSideEffects = ps.hasSideEffects; - +class VOP2_DPPe <bits<6> op, VOP2_DPP_Pseudo ps, VOPProfile P = ps.Pfl> : + VOP_DPPe <P> { bits<8> vdst; bits<8> src1; let Inst{8-0} = 0xfa; //dpp @@ -716,12 +811,6 @@ class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfil let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { -multiclass VOP32_Real_vi <bits<10> op> { - def _vi : - VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>, - VOP3e_vi<op, !cast<VOP2_Pseudo>(NAME).Pfl>; -} - multiclass VOP2_Real_MADK_vi <bits<6> op> { def _vi : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>, VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>; @@ -791,8 +880,13 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa"); let AsmString = AsmName # ps.AsmOperands; } - def _dpp : - VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName>; + foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_vi : + 
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.VI>, + VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> { + VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp"); + let AsmString = AsmName # ps.AsmOperands; + } } } @@ -819,10 +913,14 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> { VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa"); let AsmString = AsmName # ps.AsmOperands; } - def _dpp_gfx9 : - VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName> { - let DecoderNamespace = "SDWA9"; - } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx9 : + VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.GFX9>, + VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> { + VOP2_DPP_Pseudo ps = !cast<VOP2_DPP_Pseudo>(OpName#"_dpp"); + let AsmString = AsmName # ps.AsmOperands; + let DecoderNamespace = "SDWA9"; + } } multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> { @@ -840,19 +938,23 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> { VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>, VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> { } - def _dpp_gfx9 : - VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")> { - let DecoderNamespace = "SDWA9"; - } + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_gfx9 : + VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>, + VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> { + let DecoderNamespace = "SDWA9"; + } } } // AssemblerPredicates = [isGFX9] multiclass VOP2_Real_e32e64_vi <bits<6> op> : Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> { - // For now left dpp only for asm/dasm - // TODO: add corresponding pseudo - def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>; + + foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in + def _dpp_vi : + 
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>, + VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>; } defm V_CNDMASK_B32 : VOP2_Real_e32e64_vi <0x0>; @@ -899,9 +1001,6 @@ defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>; defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>; defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>; -defm V_READLANE_B32 : VOP32_Real_vi <0x289>; -defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>; - defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>; defm V_BCNT_U32_B32 : VOP2_Real_e64only_vi <0x28b>; defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64only_vi <0x28c>; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index 17ae08dc62670..4b8c1f208a0ed 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -17,16 +17,16 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); list<dag> ret3 = [(set P.DstVT:$vdst, - (node (P.Src0VT src0), + (DivergentFragOrOp<node, P>.ret (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))]; list<dag> ret2 = [(set P.DstVT:$vdst, - (node (P.Src0VT src0), + (DivergentFragOrOp<node, P>.ret (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))]; list<dag> ret1 = [(set P.DstVT:$vdst, - (node (P.Src0VT src0)))]; + (DivergentFragOrOp<node, P>.ret (P.Src0VT src0)))]; list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, @@ -35,18 +35,18 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { class getVOP3PModPat<VOPProfile P, SDPatternOperator node> { list<dag> ret3 = [(set P.DstVT:$vdst, - (node (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), + (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), (VOP3PMods P.Src0VT:$src0, 
i32:$src0_modifiers))), (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))))]; list<dag> ret2 = [(set P.DstVT:$vdst, - (node !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)), + (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)), (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))), (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers))))]; list<dag> ret1 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))]; + (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))]; list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, @@ -55,18 +55,18 @@ class getVOP3PModPat<VOPProfile P, SDPatternOperator node> { class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> { list<dag> ret3 = [(set P.DstVT:$vdst, - (node (P.Src0VT !if(P.HasClamp, (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), + (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))), (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3OpSel P.Src2VT:$src2, i32:$src2_modifiers))))]; list<dag> ret2 = [(set P.DstVT:$vdst, - (node !if(P.HasClamp, (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)), + (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)), (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))), (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))]; list<dag> ret1 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))]; + (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, 
i32:$src0_modifiers, i1:$clamp))))]; list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, @@ -75,18 +75,18 @@ class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> { class getVOP3OpSelModPat<VOPProfile P, SDPatternOperator node> { list<dag> ret3 = [(set P.DstVT:$vdst, - (node (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), + (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp), (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))), (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3OpSelMods P.Src2VT:$src2, i32:$src2_modifiers))))]; list<dag> ret2 = [(set P.DstVT:$vdst, - (node !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)), + (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)), (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))), (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers))))]; list<dag> ret1 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))]; + (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))]; list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, @@ -94,9 +94,9 @@ class getVOP3OpSelModPat<VOPProfile P, SDPatternOperator node> { } class getVOP3Pat<VOPProfile P, SDPatternOperator node> { - list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))]; - list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]; - list<dag> ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]; + list<dag> ret3 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))]; + list<dag> ret2 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, 
P>.ret P.Src0VT:$src0, P.Src1VT:$src1))]; + list<dag> ret1 = [(set P.DstVT:$vdst, (DivergentFragOrOp<node, P>.ret P.Src0VT:$src0))]; list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, ret1)); @@ -185,6 +185,7 @@ class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProf getAsm64<HasDst, NumSrcArgs, HasIntClamp, HasModifiers, HasOMod, DstVT>.ret, P.Asm64)); + let NeedPatGen = P.NeedPatGen; } class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { @@ -219,7 +220,8 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { // VOP3 INTERP //===----------------------------------------------------------------------===// -class VOP3Interp<string OpName, VOPProfile P> : VOP3_Pseudo<OpName, P> { +class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> : + VOP3_Pseudo<OpName, P, pattern> { let AsmMatchConverter = "cvtVOP3Interp"; } @@ -291,11 +293,13 @@ def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>; def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>; let SchedRW = [WriteDoubleAdd] in { +let FPDPRounding = 1 in { def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>; def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>; def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>; -def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>; -def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum, 1>; +} // End FPDPRounding = 1 +def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>; +def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>; } // End SchedRW = [WriteDoubleAdd] let SchedRW = [WriteQuarterRate32] in { @@ -323,6 +327,7 @@ def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC, def V_DIV_FMAS_F64 : VOP3_Pseudo 
<"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, getVOP3VCC<VOP_F64_F64_F64_F64_VCC, AMDGPUdiv_fmas>.ret> { let SchedRW = [WriteDouble]; + let FPDPRounding = 1; } } // End Uses = [VCC, EXEC] @@ -353,10 +358,10 @@ def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CL def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>; def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>; -let SchedRW = [WriteDoubleAdd] in { +let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in { def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>; def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>; -} // End SchedRW = [WriteDoubleAdd] +} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1 def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> { let SchedRW = [WriteFloatFMA, WriteSALU]; @@ -367,6 +372,7 @@ def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> { let SchedRW = [WriteDouble, WriteSALU]; let AsmMatchConverter = ""; + let FPDPRounding = 1; } def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; @@ -381,12 +387,12 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3 let SchedRW = [Write64Bit] in { // These instructions only exist on SI and CI -let SubtargetPredicate = isSICI in { -def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>>; -def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>>; -def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>>; +let SubtargetPredicate = isSICI, Predicates = [isSICI] in { +def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, shl>; +def V_LSHR_B64 : 
VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, srl>; +def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, sra>; def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; -} // End SubtargetPredicate = isSICI +} // End SubtargetPredicate = isSICI, Predicates = [isSICI] let SubtargetPredicate = isVI in { def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>>; @@ -395,6 +401,22 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>; } // End SubtargetPredicate = isVI } // End SchedRW = [Write64Bit] +let Predicates = [isVI] in { +def : GCNPat < + (getDivergentFrag<shl>.ret i64:$x, i32:$y), + (V_LSHLREV_B64 $y, $x) +>; +def : AMDGPUPat < + (getDivergentFrag<srl>.ret i64:$x, i32:$y), + (V_LSHRREV_B64 $y, $x) +>; +def : AMDGPUPat < + (getDivergentFrag<sra>.ret i64:$x, i32:$y), + (V_ASHRREV_I64 $y, $x) +>; +} + + let SubtargetPredicate = isCIVI in { let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { @@ -414,33 +436,51 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup> { let Predicates = [Has16BitInsts, isVIOnly]; + let FPDPRounding = 1; } def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup> { let renamedInGFX9 = 1; let Predicates = [Has16BitInsts, isGFX9]; + let FPDPRounding = 1; +} + +def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma> { + let Predicates = [Has16BitInsts, isVIOnly]; + let FPDPRounding = 1; +} +def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, fma> { + let renamedInGFX9 = 1; + let Predicates = [Has16BitInsts, isGFX9]; + let FPDPRounding = 1; } let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in { let renamedInGFX9 = 1 in { -def 
V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>; def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>; def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>; -def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>; +let FPDPRounding = 1 in { +def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>; +let Uses = [M0, EXEC] in { def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>; -} +} // End Uses = [M0, EXEC] +} // End FPDPRounding = 1 +} // End renamedInGFX9 = 1 let SubtargetPredicate = isGFX9 in { -def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>; +def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>> { + let FPDPRounding = 1; +} def V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>; def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>; -def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>; def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>; } // End SubtargetPredicate = isGFX9 +let Uses = [M0, EXEC], FPDPRounding = 1 in { def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>; def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>; +} // End Uses = [M0, EXEC], FPDPRounding = 1 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1 @@ -461,17 +501,6 @@ def : GCNPat < (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)) >; -def : GCNPat< - (i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))), - (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)) ->; - -def : GCNPat< - (i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))), - (REG_SEQUENCE VReg_64, 
- (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)), sub0, - (V_MOV_B32_e32 (i32 0)), sub1) ->; } defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>; @@ -479,6 +508,37 @@ defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>; } // End Predicates = [Has16BitInsts] +class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag< + (ops node:$x, node:$y, node:$z), + // When the inner operation is used multiple times, selecting 3-op + // instructions may still be beneficial -- if the other users can be + // combined similarly. Let's be conservative for now. + (op2 (HasOneUseBinOp<op1> node:$x, node:$y), node:$z), + [{ + // Only use VALU ops when the result is divergent. + if (!N->isDivergent()) + return false; + + // Check constant bus limitations. + // + // Note: Use !isDivergent as a conservative proxy for whether the value + // is in an SGPR (uniform values can end up in VGPRs as well). + unsigned ConstantBusUses = 0; + for (unsigned i = 0; i < 3; ++i) { + if (!Operands[i]->isDivergent() && + !isInlineImmediate(Operands[i].getNode())) { + ConstantBusUses++; + if (ConstantBusUses >= 2) + return false; + } + } + + return true; + }] +> { + let PredicateCodeUsesOperands = 1; +} + let SubtargetPredicate = isGFX9 in { def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>; def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; @@ -513,6 +573,22 @@ def V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B3 def V_ADD_I32_gfx9 : VOP3Inst <"v_add_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>; def V_SUB_I32_gfx9 : VOP3Inst <"v_sub_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>; + + +class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat < + // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions. 
+ (ThreeOpFrag<op1, op2> i32:$src0, i32:$src1, i32:$src2), + (inst i32:$src0, i32:$src1, i32:$src2) +>; + +def : ThreeOp_i32_Pats<shl, add, V_LSHL_ADD_U32>; +def : ThreeOp_i32_Pats<add, shl, V_ADD_LSHL_U32>; +def : ThreeOp_i32_Pats<add, add, V_ADD3_U32>; +def : ThreeOp_i32_Pats<shl, or, V_LSHL_OR_B32>; +def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32>; +def : ThreeOp_i32_Pats<or, or, V_OR3_B32>; +def : ThreeOp_i32_Pats<xor, add, V_XAD_U32>; + } // End SubtargetPredicate = isGFX9 //===----------------------------------------------------------------------===// @@ -662,23 +738,23 @@ defm V_MAD_I64_I32 : VOP3be_Real_ci <0x177>; let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in { multiclass VOP3_Real_vi<bits<10> op> { - def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>, - VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>; + def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>, + VOP3e_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>; } multiclass VOP3be_Real_vi<bits<10> op> { - def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>, - VOP3be_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>; + def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>, + VOP3be_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>; } multiclass VOP3OpSel_Real_gfx9<bits<10> op> { - def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>, - VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(NAME).Pfl>; + def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>, + VOP3OpSel_gfx9 <op, !cast<VOP_Pseudo>(NAME).Pfl>; } multiclass VOP3Interp_Real_vi<bits<10> op> { - def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>, - VOP3Interp_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>; + def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>, + VOP3Interp_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>; } } // End AssemblerPredicates = [isVI], DecoderNamespace = "VI" @@ -786,12 +862,15 @@ defm V_FMA_F16 : VOP3_F16_Real_vi <0x1ee>; defm V_DIV_FIXUP_F16 : VOP3_F16_Real_vi <0x1ef>; 
defm V_INTERP_P2_F16 : VOP3Interp_F16_Real_vi <0x276>; +let FPDPRounding = 1 in { defm V_MAD_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ea, "V_MAD_F16", "v_mad_legacy_f16">; -defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">; -defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">; defm V_FMA_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ee, "V_FMA_F16", "v_fma_legacy_f16">; defm V_DIV_FIXUP_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ef, "V_DIV_FIXUP_F16", "v_div_fixup_legacy_f16">; defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16", "v_interp_p2_legacy_f16">; +} // End FPDPRounding = 1 + +defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">; +defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">; defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">; defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">; @@ -824,6 +903,9 @@ defm V_MUL_LO_I32 : VOP3_Real_vi <0x285>; defm V_MUL_HI_U32 : VOP3_Real_vi <0x286>; defm V_MUL_HI_I32 : VOP3_Real_vi <0x287>; +defm V_READLANE_B32 : VOP3_Real_vi <0x289>; +defm V_WRITELANE_B32 : VOP3_Real_vi <0x28a>; + defm V_LSHLREV_B64 : VOP3_Real_vi <0x28f>; defm V_LSHRREV_B64 : VOP3_Real_vi <0x290>; defm V_ASHRREV_I64 : VOP3_Real_vi <0x291>; diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td index b51828b546797..91b45583c8489 100644 --- a/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/lib/Target/AMDGPU/VOP3PInstructions.td @@ -42,14 +42,16 @@ class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0, } let isCommutable = 1 in { -def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>; def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; +let FPDPRounding = 1 in { +def 
V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>; def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>; def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>; -def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>; -def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>; +} // End FPDPRounding = 1 +def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>; +def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>; def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>; def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>; @@ -137,12 +139,14 @@ let SubtargetPredicate = HasMadMixInsts in { let isCommutable = 1 in { def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>; +let FPDPRounding = 1 in { // Clamp modifier is applied after conversion to f16. def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>; let ClampLo = 0, ClampHi = 1 in { def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>; } +} // End FPDPRounding = 1 } defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>; @@ -154,18 +158,99 @@ let SubtargetPredicate = HasFmaMixInsts in { let isCommutable = 1 in { def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>; +let FPDPRounding = 1 in { // Clamp modifier is applied after conversion to f16. 
def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>; let ClampLo = 0, ClampHi = 1 in { def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>; } +} // End FPDPRounding = 1 } defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>; } -let SubtargetPredicate = HasDLInsts in { +// Defines patterns that extract signed 4bit from each Idx[0]. +foreach Idx = [[0,28],[4,24],[8,20],[12,16],[16,12],[20,8],[24,4]] in + def ExtractSigned4bit_#Idx[0] : PatFrag<(ops node:$src), + (sra (shl node:$src, (i32 Idx[1])), (i32 28))>; + +// Defines code pattern that extracts U(unsigned/signed) 4/8bit from FromBitIndex. +class Extract<int FromBitIndex, int BitMask, bit U>: PatFrag< + (ops node:$src), + !if (!or (!and (!eq (BitMask, 255), !eq (FromBitIndex, 24)), !eq (FromBitIndex, 28)), // last element + !if (U, (srl node:$src, (i32 FromBitIndex)), (sra node:$src, (i32 FromBitIndex))), + !if (!eq (FromBitIndex, 0), // first element + !if (U, (and node:$src, (i32 BitMask)), + !if (!eq (BitMask, 15), (!cast<PatFrag>("ExtractSigned4bit_"#FromBitIndex) node:$src), + (sext_inreg node:$src, i8))), + !if (U, (and (srl node:$src, (i32 FromBitIndex)), (i32 BitMask)), + !if (!eq (BitMask, 15), (!cast<PatFrag>("ExtractSigned4bit_"#FromBitIndex) node:$src), + (sext_inreg (srl node:$src, (i32 FromBitIndex)), i8)))))>; + + +foreach Type = ["I", "U"] in + foreach Index = 0-3 in { + // Defines patterns that extract each Index'ed 8bit from an unsigned + // 32bit scalar value; + def #Type#Index#"_8bit" : Extract<!shl(Index, 3), 255, !if (!eq (Type, "U"), 1, 0)>; + + // Defines multiplication patterns where the multiplication is happening on each + // Index'ed 8bit of a 32bit scalar value. 
+ + def Mul#Type#_Elt#Index : PatFrag< + (ops node:$src0, node:$src1), + (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), AMDGPUmul_i24_oneuse, AMDGPUmul_u24_oneuse)) + (!cast<Extract>(#Type#Index#"_8bit") node:$src0), + (!cast<Extract>(#Type#Index#"_8bit") node:$src1))>; + } + +// Different variants of dot8 patterns cause a huge increase in the compile time. +// Define non-associative/commutative add/mul to prevent permutation in the dot8 +// pattern. +def NonACAdd : SDNode<"ISD::ADD" , SDTIntBinOp>; +def NonACAdd_oneuse : HasOneUseBinOp<NonACAdd>; + +def NonACAMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24" , SDTIntBinOp>; +def NonACAMDGPUmul_u24_oneuse : HasOneUseBinOp<NonACAMDGPUmul_u24>; + +def NonACAMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24" , SDTIntBinOp>; +def NonACAMDGPUmul_i24_oneuse : HasOneUseBinOp<NonACAMDGPUmul_i24>; + +foreach Type = ["I", "U"] in + foreach Index = 0-7 in { + // Defines patterns that extract each Index'ed 4bit from an unsigned + // 32bit scalar value; + def #Type#Index#"_4bit" : Extract<!shl(Index, 2), 15, !if (!eq (Type, "U"), 1, 0)>; + + // Defines multiplication patterns where the multiplication is happening on each + // Index'ed 8bit of a 32bit scalar value. 
+ def Mul#Type#Index#"_4bit" : PatFrag< + (ops node:$src0, node:$src1), + (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), NonACAMDGPUmul_i24_oneuse, NonACAMDGPUmul_u24_oneuse)) + (!cast<Extract>(#Type#Index#"_4bit") node:$src0), + (!cast<Extract>(#Type#Index#"_4bit") node:$src1))>; + } + +class UDot2Pat<Instruction Inst> : GCNPat < + (add (add_oneuse (AMDGPUmul_u24_oneuse (srl i32:$src0, (i32 16)), + (srl i32:$src1, (i32 16))), i32:$src2), + (AMDGPUmul_u24_oneuse (and i32:$src0, (i32 65535)), + (and i32:$src1, (i32 65535))) + ), + (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0)) +>; + +class SDot2Pat<Instruction Inst> : GCNPat < + (add (add_oneuse (AMDGPUmul_i24_oneuse (sra i32:$src0, (i32 16)), + (sra i32:$src1, (i32 16))), i32:$src2), + (AMDGPUmul_i24_oneuse (sext_inreg i32:$src0, i16), + (sext_inreg i32:$src1, i16))), + (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0)) +>; + +let SubtargetPredicate = HasDotInsts in { def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>; def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>; @@ -192,7 +277,32 @@ defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>; defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>; defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>; -} // End SubtargetPredicate = HasDLInsts +def : UDot2Pat<V_DOT2_U32_U16>; +def : SDot2Pat<V_DOT2_I32_I16>; + +foreach Type = ["U", "I"] in + def : GCNPat < + !cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y, + (add_oneuse lhs, (!cast<PatFrag>("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))), + (!cast<VOP3PInst>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; + +foreach Type = ["U", "I"] in + def : GCNPat < + !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)), + [1, 2, 3, 4, 5, 6, 7], lhs, y, + (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))), + 
(!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; + +// Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase +// in the compile time. Directly handle the pattern generated by the FE here. +foreach Type = ["U", "I"] in + def : GCNPat < + !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)), + [7, 1, 2, 3, 4, 5, 6], lhs, y, + (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))), + (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>; + +} // End SubtargetPredicate = HasDotInsts multiclass VOP3P_Real_vi<bits<10> op> { def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>, @@ -242,7 +352,7 @@ defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>; } -let SubtargetPredicate = HasDLInsts in { +let SubtargetPredicate = HasDotInsts in { defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>; defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>; @@ -252,4 +362,4 @@ defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x3a9>; defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>; defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x3ab>; -} // End SubtargetPredicate = HasDLInsts +} // End SubtargetPredicate = HasDotInsts diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index cc6b8116afee1..091cac8cd35ca 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -635,6 +635,17 @@ def : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>; def : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>; def : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>; +def : ICMP_Pattern <COND_EQ, V_CMP_EQ_U16_e64, i16>; +def : ICMP_Pattern <COND_NE, V_CMP_NE_U16_e64, i16>; +def : ICMP_Pattern <COND_UGT, V_CMP_GT_U16_e64, i16>; +def : ICMP_Pattern <COND_UGE, V_CMP_GE_U16_e64, i16>; +def : ICMP_Pattern <COND_ULT, V_CMP_LT_U16_e64, i16>; +def : 
ICMP_Pattern <COND_ULE, V_CMP_LE_U16_e64, i16>; +def : ICMP_Pattern <COND_SGT, V_CMP_GT_I16_e64, i16>; +def : ICMP_Pattern <COND_SGE, V_CMP_GE_I16_e64, i16>; +def : ICMP_Pattern <COND_SLT, V_CMP_LT_I16_e64, i16>; +def : ICMP_Pattern <COND_SLE, V_CMP_LE_I16_e64, i16>; + class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : GCNPat < (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)), @@ -656,6 +667,14 @@ def : FCMP_Pattern <COND_OGE, V_CMP_GE_F64_e64, f64>; def : FCMP_Pattern <COND_OLT, V_CMP_LT_F64_e64, f64>; def : FCMP_Pattern <COND_OLE, V_CMP_LE_F64_e64, f64>; +def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_e64, f16>; +def : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_e64, f16>; +def : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_e64, f16>; +def : FCMP_Pattern <COND_OGE, V_CMP_GE_F16_e64, f16>; +def : FCMP_Pattern <COND_OLT, V_CMP_LT_F16_e64, f16>; +def : FCMP_Pattern <COND_OLE, V_CMP_LE_F16_e64, f16>; + + def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F32_e64, f32>; def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F32_e64, f32>; def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F32_e64, f32>; @@ -670,6 +689,13 @@ def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>; def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>; def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>; +def : FCMP_Pattern <COND_UEQ, V_CMP_NLG_F16_e64, f16>; +def : FCMP_Pattern <COND_UNE, V_CMP_NEQ_F16_e64, f16>; +def : FCMP_Pattern <COND_UGT, V_CMP_NLE_F16_e64, f16>; +def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F16_e64, f16>; +def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F16_e64, f16>; +def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_e64, f16>; + //===----------------------------------------------------------------------===// // Target //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index f0f7f259f71d2..7de7d90d27b3a 100644 --- 
a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -420,10 +420,10 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> : let SDWA = 1; let Uses = [EXEC]; - let SubtargetPredicate = !if(P.HasExt, HasSDWA, DisableInst); - let AssemblerPredicate = !if(P.HasExt, HasSDWA, DisableInst); - let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA, - AMDGPUAsmVariants.Disable); + let SubtargetPredicate = !if(P.HasExtSDWA, HasSDWA, DisableInst); + let AssemblerPredicate = !if(P.HasExtSDWA, HasSDWA, DisableInst); + let AsmVariantName = !if(P.HasExtSDWA, AMDGPUAsmVariants.SDWA, + AMDGPUAsmVariants.Disable); let DecoderNamespace = "SDWA"; VOPProfile Pfl = P; @@ -471,10 +471,10 @@ class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> : let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; - let SubtargetPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst); - let AssemblerPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst); - let AsmVariantName = !if(ps.Pfl.HasSDWA9, AMDGPUAsmVariants.SDWA9, - AMDGPUAsmVariants.Disable); + let SubtargetPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA9, DisableInst); + let AssemblerPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA9, DisableInst); + let AsmVariantName = !if(ps.Pfl.HasExtSDWA9, AMDGPUAsmVariants.SDWA9, + AMDGPUAsmVariants.Disable); let DecoderNamespace = "SDWA9"; // Copy relevant pseudo op flags @@ -505,9 +505,14 @@ class VOP_DPPe<VOPProfile P> : Enc64 { let Inst{63-60} = row_mask; } -class VOP_DPP <string OpName, VOPProfile P> : - InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, []>, - VOP_DPPe<P> { +class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : + InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, pattern>, + VOP <OpName>, + SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE>, + MnemonicAlias <OpName#"_dpp", OpName> { + + let isPseudo = 1; + let isCodeGenOnly = 1; let mayLoad = 0; let mayStore = 0; @@ -517,15 +522,99 @@ class VOP_DPP <string OpName, 
VOPProfile P> : let VALU = 1; let DPP = 1; let Size = 8; + let Uses = [EXEC]; + let isConvergent = 1; + + string Mnemonic = OpName; + string AsmOperands = P.AsmDPP; let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", ""); let SubtargetPredicate = HasDPP; - let AssemblerPredicate = !if(P.HasExt, HasDPP, DisableInst); - let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP, - AMDGPUAsmVariants.Disable); + let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst); + let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP, + AMDGPUAsmVariants.Disable); let Constraints = !if(P.NumSrcArgs, "$old = $vdst", ""); let DisableEncoding = !if(P.NumSrcArgs, "$old", ""); let DecoderNamespace = "DPP"; + + VOPProfile Pfl = P; +} + +class VOP_DPP_Real <VOP_DPP_Pseudo ps, int EncodingFamily> : + InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>, + SIMCInstr <ps.PseudoInstr, EncodingFamily> { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + let Defs = ps.Defs; + let Uses = ps.Uses; + let SchedRW = ps.SchedRW; + let hasSideEffects = ps.hasSideEffects; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + // Copy relevant pseudo op flags + let isConvergent = ps.isConvergent; + let SubtargetPredicate = ps.SubtargetPredicate; + let AssemblerPredicate = ps.AssemblerPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let AsmVariantName = ps.AsmVariantName; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let DecoderNamespace = ps.DecoderNamespace; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; +} + +class getNumNodeArgs<SDPatternOperator Op> { + SDNode N = !cast<SDNode>(Op); + SDTypeProfile TP = N.TypeProfile; + int ret = TP.NumOperands; +} + + +class getDivergentFrag<SDPatternOperator Op> { + + int NumSrcArgs = getNumNodeArgs<Op>.ret; + PatFrag ret = PatFrag < + !if(!eq(NumSrcArgs, 1), + (ops node:$src0), + !if(!eq(NumSrcArgs, 2), 
+ (ops node:$src0, node:$src1), + (ops node:$src0, node:$src1, node:$src2))), + !if(!eq(NumSrcArgs, 1), + (Op $src0), + !if(!eq(NumSrcArgs, 2), + (Op $src0, $src1), + (Op $src0, $src1, $src2))), + [{ return N->isDivergent(); }] + >; +} + +class VOPPatGen<SDPatternOperator Op, VOPProfile P> { + + PatFrag Operator = getDivergentFrag < Op >.ret; + + dag Ins = !foreach(tmp, P.Ins32, !subst(ins, Operator, + !subst(P.Src0RC32, P.Src0VT, + !subst(P.Src1RC32, P.Src1VT, tmp)))); + + + dag Outs = !foreach(tmp, P.Outs32, !subst(outs, set, + !subst(P.DstRC, P.DstVT, tmp))); + + list<dag> ret = [!con(Outs, (set Ins))]; +} + +class VOPPatOrNull<SDPatternOperator Op, VOPProfile P> { + list<dag> ret = !if(!ne(P.NeedPatGen,PatGenMode.NoPattern), VOPPatGen<Op, P>.ret, []); +} + +class DivergentFragOrOp<SDPatternOperator Op, VOPProfile P> { + SDPatternOperator ret = !if(!eq(P.NeedPatGen,PatGenMode.Pattern), + !if(!isa<SDNode>(Op), getDivergentFrag<Op>.ret, Op), Op); } include "VOPCInstructions.td" |
