Diffstat (limited to 'lib/Target/AMDGPU')
148 files changed, 18452 insertions, 4821 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 568682899be5..0ddc43ad5033 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -23,6 +23,7 @@ class ModulePass; class Pass; class Target; class TargetMachine; +class TargetOptions; class PassRegistry; class Module; @@ -34,6 +35,7 @@ FunctionPass *createR600ClauseMergePass(); FunctionPass *createR600Packetizer(); FunctionPass *createR600ControlFlowFinalizer(); FunctionPass *createAMDGPUCFGStructurizerPass(); +FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel); // SI Passes FunctionPass *createSIAnnotateControlFlowPass(); @@ -44,12 +46,20 @@ FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILoadStoreOptimizerPass(); FunctionPass *createSIWholeQuadModePass(); FunctionPass *createSIFixControlFlowLiveIntervalsPass(); +FunctionPass *createSIOptimizeExecMaskingPreRAPass(); FunctionPass *createSIFixSGPRCopiesPass(); +FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitsPass(); FunctionPass *createSIInsertWaitcntsPass(); +FunctionPass *createSIFixWWMLivenessPass(); +FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &); +FunctionPass *createAMDGPUUseNativeCallsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(); FunctionPass *createAMDGPUMachineCFGStructurizerPass(); +FunctionPass *createAMDGPURewriteOutArgumentsPass(); + +void initializeAMDGPUDAGToDAGISelPass(PassRegistry&); void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&); extern char &AMDGPUMachineCFGStructurizerID; @@ -64,6 +74,24 @@ ModulePass *createAMDGPULowerIntrinsicsPass(); void initializeAMDGPULowerIntrinsicsPass(PassRegistry &); extern char &AMDGPULowerIntrinsicsID; +void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &); +extern char &AMDGPURewriteOutArgumentsID; + +void initializeR600ClauseMergePassPass(PassRegistry &); +extern char &R600ClauseMergePassID; + +void initializeR600ControlFlowFinalizerPass(PassRegistry &); +extern char &R600ControlFlowFinalizerID; + +void initializeR600ExpandSpecialInstrsPassPass(PassRegistry &); +extern char &R600ExpandSpecialInstrsPassID; + +void initializeR600VectorRegMergerPass(PassRegistry &); +extern char &R600VectorRegMergerID; + +void initializeR600PacketizerPass(PassRegistry &); +extern char &R600PacketizerID; + void initializeSIFoldOperandsPass(PassRegistry &); extern char &SIFoldOperandsID; @@ -97,14 +125,24 @@ extern char &SIInsertSkipsPassID; void initializeSIOptimizeExecMaskingPass(PassRegistry &); extern char &SIOptimizeExecMaskingID; +void initializeSIFixWWMLivenessPass(PassRegistry &); +extern char &SIFixWWMLivenessID; + +void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &); +extern char &AMDGPUSimplifyLibCallsID; + +void initializeAMDGPUUseNativeCallsPass(PassRegistry &); +extern char &AMDGPUUseNativeCallsID; + // Passes common to R600 and SI FunctionPass *createAMDGPUPromoteAlloca(); void initializeAMDGPUPromoteAllocaPass(PassRegistry&); extern char &AMDGPUPromoteAllocaID; Pass *createAMDGPUStructurizeCFGPass(); -FunctionPass *createAMDGPUISelDag(TargetMachine &TM, - CodeGenOpt::Level OptLevel); +FunctionPass *createAMDGPUISelDag( + TargetMachine *TM = nullptr, + CodeGenOpt::Level OptLevel = CodeGenOpt::Default); ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true); ModulePass *createAMDGPUOpenCLImageTypeLoweringPass(); FunctionPass *createAMDGPUAnnotateUniformValues(); @@ -113,8 +151,8 @@ ModulePass* 
createAMDGPUUnifyMetadataPass(); void initializeAMDGPUUnifyMetadataPass(PassRegistry&); extern char &AMDGPUUnifyMetadataID; -void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); -extern char &SIFixControlFlowLiveIntervalsID; +void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry&); +extern char &SIOptimizeExecMaskingPreRAID; void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&); extern char &AMDGPUAnnotateUniformValuesPassID; @@ -125,6 +163,9 @@ extern char &AMDGPUCodeGenPrepareID; void initializeSIAnnotateControlFlowPass(PassRegistry&); extern char &SIAnnotateControlFlowPassID; +void initializeSIMemoryLegalizerPass(PassRegistry&); +extern char &SIMemoryLegalizerID; + void initializeSIDebuggerInsertNopsPass(PassRegistry&); extern char &SIDebuggerInsertNopsID; @@ -140,6 +181,15 @@ extern char &AMDGPUUnifyDivergentExitNodesID; ImmutablePass *createAMDGPUAAWrapperPass(); void initializeAMDGPUAAWrapperPassPass(PassRegistry&); +void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &); + +Pass *createAMDGPUFunctionInliningPass(); +void initializeAMDGPUInlinerPass(PassRegistry&); + +ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass(); +void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &); +extern char &AMDGPUOpenCLEnqueuedBlockLoweringID; + Target &getTheAMDGPUTarget(); Target &getTheGCNTarget(); @@ -167,39 +217,44 @@ struct AMDGPUAS { unsigned FLAT_ADDRESS; ///< Address space for flat memory. unsigned REGION_ADDRESS; ///< Address space for region memory. - // The maximum value for flat, generic, local, private, constant and region. - const static unsigned MAX_COMMON_ADDRESS = 5; - - const static unsigned GLOBAL_ADDRESS = 1; ///< Address space for global memory (RAT0, VTX0). - const static unsigned CONSTANT_ADDRESS = 2; ///< Address space for constant memory (VTX2) - const static unsigned LOCAL_ADDRESS = 3; ///< Address space for local memory. - const static unsigned PARAM_D_ADDRESS = 6; ///< Address space for direct addressible parameter memory (CONST0) - const static unsigned PARAM_I_ADDRESS = 7; ///< Address space for indirect addressible parameter memory (VTX1) - - // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this - // order to be able to dynamically index a constant buffer, for example: - // - // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx - - const static unsigned CONSTANT_BUFFER_0 = 8; - const static unsigned CONSTANT_BUFFER_1 = 9; - const static unsigned CONSTANT_BUFFER_2 = 10; - const static unsigned CONSTANT_BUFFER_3 = 11; - const static unsigned CONSTANT_BUFFER_4 = 12; - const static unsigned CONSTANT_BUFFER_5 = 13; - const static unsigned CONSTANT_BUFFER_6 = 14; - const static unsigned CONSTANT_BUFFER_7 = 15; - const static unsigned CONSTANT_BUFFER_8 = 16; - const static unsigned CONSTANT_BUFFER_9 = 17; - const static unsigned CONSTANT_BUFFER_10 = 18; - const static unsigned CONSTANT_BUFFER_11 = 19; - const static unsigned CONSTANT_BUFFER_12 = 20; - const static unsigned CONSTANT_BUFFER_13 = 21; - const static unsigned CONSTANT_BUFFER_14 = 22; - const static unsigned CONSTANT_BUFFER_15 = 23; - - // Some places use this if the address space can't be determined. - const static unsigned UNKNOWN_ADDRESS_SPACE = ~0u; + enum : unsigned { + // The maximum value for flat, generic, local, private, constant and region. + MAX_COMMON_ADDRESS = 5, + + GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). 
+ CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2) + LOCAL_ADDRESS = 3, ///< Address space for local memory. + /// Address space for direct addressible parameter memory (CONST0) + PARAM_D_ADDRESS = 6, + /// Address space for indirect addressible parameter memory (VTX1) + PARAM_I_ADDRESS = 7, + + // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on + // this order to be able to dynamically index a constant buffer, for + // example: + // + // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx + + CONSTANT_BUFFER_0 = 8, + CONSTANT_BUFFER_1 = 9, + CONSTANT_BUFFER_2 = 10, + CONSTANT_BUFFER_3 = 11, + CONSTANT_BUFFER_4 = 12, + CONSTANT_BUFFER_5 = 13, + CONSTANT_BUFFER_6 = 14, + CONSTANT_BUFFER_7 = 15, + CONSTANT_BUFFER_8 = 16, + CONSTANT_BUFFER_9 = 17, + CONSTANT_BUFFER_10 = 18, + CONSTANT_BUFFER_11 = 19, + CONSTANT_BUFFER_12 = 20, + CONSTANT_BUFFER_13 = 21, + CONSTANT_BUFFER_14 = 22, + CONSTANT_BUFFER_15 = 23, + + // Some places use this if the address space can't be determined. + UNKNOWN_ADDRESS_SPACE = ~0u, + }; }; namespace llvm { diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index f1d899c4d003..c02d0a131041 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -19,6 +19,12 @@ def FeatureFP64 : SubtargetFeature<"fp64", "Enable double precision operations" >; +def FeatureFMA : SubtargetFeature<"fmaf", + "FMA", + "true", + "Enable single precision FMA (not as fast as mul+add, but fused)" +>; + def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", "FastFMAF32", "true", @@ -79,6 +85,12 @@ def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts", "Have scratch_* flat memory instructions" >; +def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", + "AddNoCarryInsts", + "true", + "Have VALU add/sub instructions without carry out" +>; + def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access", "UnalignedBufferAccess", "true", @@ -103,6 +115,12 @@ def FeatureApertureRegs : SubtargetFeature<"aperture-regs", "Has Memory Aperture Base and Size Registers" >; +def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts", + "HasMadMixInsts", + "true", + "Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions" +>; + // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support // XNACK. 
The current default kernel driver setting is: // - graphics ring: XNACK disabled @@ -119,7 +137,7 @@ def FeatureXNACK : SubtargetFeature<"xnack", def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "SGPRInitBug", "true", - "VI SGPR initilization bug requiring a fixed SGPR allocation size" + "VI SGPR initialization bug requiring a fixed SGPR allocation size" >; class SubtargetFeatureFetchLimit <string Value> : @@ -166,12 +184,6 @@ def FeatureGCN : SubtargetFeature<"gcn", "GCN or newer GPU" >; -def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding", - "GCN1Encoding", - "true", - "Encoding format for SI and CI" ->; - def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", "GCN3Encoding", "true", @@ -181,13 +193,13 @@ def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", def FeatureCIInsts : SubtargetFeature<"ci-insts", "CIInsts", "true", - "Additional intstructions for CI+" + "Additional instructions for CI+" >; def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts", "GFX9Insts", "true", - "Additional intstructions for GFX9+" + "Additional instructions for GFX9+" >; def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime", @@ -274,6 +286,12 @@ def FeatureDPP : SubtargetFeature<"dpp", "Support DPP (Data Parallel Primitives) extension" >; +def FeatureIntClamp : SubtargetFeature<"int-clamp-insts", + "HasIntClamp", + "true", + "Support clamp for integer destination" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -334,6 +352,13 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>; def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; +def FeatureEnableHugePrivateBuffer : SubtargetFeature< + "huge-private-buffer", + "EnableHugePrivateBuffer", + "true", + "Enable private/scratch buffer sizes greater than 128 GB" +>; + def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", "EnableVGPRSpilling", "true", @@ -402,6 +427,13 @@ def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature < "Hardware automatically inserts waitcnt before barrier" >; +def FeatureCodeObjectV3 : SubtargetFeature < + "code-object-v3", + "CodeObjectV3", + "true", + "Generate code object version 3" +>; + // Dummy feature used to disable assembler instructions. 
def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", @@ -436,14 +468,14 @@ def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", [FeatureFP64, FeatureLocalMemorySize32768, - FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding, + FeatureWavefrontSize64, FeatureGCN, FeatureLDSBankCount32, FeatureMovrel] >; def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", [FeatureFP64, FeatureLocalMemorySize65536, FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, - FeatureGCN1Encoding, FeatureCIInsts, FeatureMovrel] + FeatureCIInsts, FeatureMovrel] >; def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", @@ -452,7 +484,8 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, FeatureScalarStores, FeatureInv2PiInlineImm, - FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP + FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP, + FeatureIntClamp ] >; @@ -462,9 +495,10 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, - FeatureFastFMAF32, FeatureDPP, + FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, - FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts + FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, + FeatureAddNoCarryInsts ] >; @@ -506,6 +540,10 @@ def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3, [FeatureSeaIslands, FeatureLDSBankCount16]>; +def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4, + [FeatureSeaIslands, + FeatureLDSBankCount32]>; + def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0, [FeatureVolcanicIslands, FeatureLDSBankCount32, @@ -513,6 +551,8 @@ def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0, def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, [FeatureVolcanicIslands, + FeatureFastFMAF32, + HalfRate64Ops, FeatureLDSBankCount32, FeatureXNACK]>; @@ -525,10 +565,6 @@ def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3, [FeatureVolcanicIslands, FeatureLDSBankCount32]>; -def FeatureISAVersion8_0_4 : SubtargetFeatureISAVersion <8,0,4, - [FeatureVolcanicIslands, - FeatureLDSBankCount32]>; - def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, [FeatureVolcanicIslands, FeatureLDSBankCount16, @@ -536,21 +572,15 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0, [FeatureGFX9, - FeatureLDSBankCount32]>; - -def FeatureISAVersion9_0_1 : SubtargetFeatureISAVersion <9,0,1, - [FeatureGFX9, - FeatureLDSBankCount32, - FeatureXNACK]>; + FeatureMadMixInsts, + FeatureLDSBankCount32 + ]>; def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2, [FeatureGFX9, - FeatureLDSBankCount32]>; - -def FeatureISAVersion9_0_3 : SubtargetFeatureISAVersion <9,0,3, - [FeatureGFX9, - FeatureLDSBankCount32, - FeatureXNACK]>; + FeatureMadMixInsts, + FeatureLDSBankCount32 + ]>; //===----------------------------------------------------------------------===// // Debugger related subtarget features. 
@@ -660,7 +690,7 @@ def TruePredicate : Predicate<"true">; def isSICI : Predicate< "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" ->, AssemblerPredicate<"FeatureGCN1Encoding">; +>, AssemblerPredicate<"!FeatureGCN3Encoding">; def isVI : Predicate < "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, @@ -680,6 +710,23 @@ def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, AssemblerPredicate<"FeatureFlatGlobalInsts">; +def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">, + AssemblerPredicate<"FeatureFlatScratchInsts">; +def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, + AssemblerPredicate<"FeatureGFX9Insts">; + + +def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">; +def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">; + +def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, + AssemblerPredicate<"FeatureGFX9Insts">; + +def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarryInsts()">, + AssemblerPredicate<"FeatureAddNoCarryInsts">; + +def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarryInsts()">, + AssemblerPredicate<"!FeatureAddNoCarryInsts">; def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, AssemblerPredicate<"Feature16BitInsts">; @@ -695,22 +742,41 @@ def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">, def HasDPP : Predicate<"Subtarget->hasDPP()">, AssemblerPredicate<"FeatureDPP">; +def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">, + AssemblerPredicate<"FeatureIntClamp">; + +def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">, + AssemblerPredicate<"FeatureMadMixInsts">; + +def EnableLateCFGStructurize : Predicate< + "EnableLateStructurizeCFG">; + +// Exists to help track down where SubtargetPredicate isn't set rather +// than letting tablegen crash with an unhelpful error. 
+def InvalidPred : Predicate<"predicate not set on instruction or pattern">; + class PredicateControl { - Predicate SubtargetPredicate; + Predicate SubtargetPredicate = InvalidPred; Predicate SIAssemblerPredicate = isSICI; Predicate VIAssemblerPredicate = isVI; list<Predicate> AssemblerPredicates = []; Predicate AssemblerPredicate = TruePredicate; list<Predicate> OtherPredicates = []; - list<Predicate> Predicates = !listconcat([SubtargetPredicate, AssemblerPredicate], + list<Predicate> Predicates = !listconcat([SubtargetPredicate, + AssemblerPredicate], AssemblerPredicates, OtherPredicates); } +class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>, + PredicateControl; + + // Include AMDGPU TD files include "R600Schedule.td" +include "R600Processors.td" include "SISchedule.td" -include "Processors.td" +include "GCNProcessors.td" include "AMDGPUInstrInfo.td" include "AMDGPUIntrinsics.td" include "AMDGPURegisterInfo.td" diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp index faa424eb0a64..392b011e387c 100644 --- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp +++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -1,4 +1,4 @@ -//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==// +//===- AMDGPUAliasAnalysis ------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +12,21 @@ #include "AMDGPUAliasAnalysis.h" #include "AMDGPU.h" +#include "llvm/ADT/Triple.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include <cassert> using namespace llvm; @@ -26,6 +34,7 @@ using namespace llvm; // Register this pass... char AMDGPUAAWrapperPass::ID = 0; + INITIALIZE_PASS(AMDGPUAAWrapperPass, "amdgpu-aa", "AMDGPU Address space based Alias Analysis", false, true) @@ -120,8 +129,11 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, switch (F->getCallingConv()) { default: return AAResultBase::pointsToConstantMemory(Loc, OrLocal); - case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_LS: + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_VS: case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_KERNEL: diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h index 5f8ed9b1f9a3..645a38af753c 100644 --- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h +++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h @@ -1,4 +1,4 @@ -//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==// +//===- AMDGPUAliasAnalysis --------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -10,17 +10,24 @@ /// This is the AMGPU address space based alias analysis pass. 
//===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H -#define LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H #include "AMDGPU.h" +#include "llvm/ADT/Triple.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" +#include <algorithm> +#include <memory> namespace llvm { +class DataLayout; +class MDNode; +class MemoryLocation; + /// A simple AA result that uses TBAA metadata to answer queries. class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> { friend AAResultBase<AMDGPUAAResult>; @@ -50,7 +57,9 @@ private: class ASAliasRulesTy { public: ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_); + AliasResult getAliasResult(unsigned AS1, unsigned AS2) const; + private: Triple::ArchType Arch; AMDGPUAS AS; @@ -61,10 +70,11 @@ private: /// Analysis pass providing a never-invalidated alias analysis result. class AMDGPUAA : public AnalysisInfoMixin<AMDGPUAA> { friend AnalysisInfoMixin<AMDGPUAA>; + static char PassID; public: - typedef AMDGPUAAResult Result; + using Result = AMDGPUAAResult; AMDGPUAAResult run(Function &F, AnalysisManager<Function> &AM) { return AMDGPUAAResult(F.getParent()->getDataLayout(), @@ -91,12 +101,15 @@ public: Triple(M.getTargetTriple()))); return false; } + bool doFinalization(Module &M) override { Result.reset(); return false; } + void getAnalysisUsage(AnalysisUsage &AU) const override; }; -} -#endif // LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index 6f3742ed039b..c27425443abc 100644 --- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -21,6 +21,12 @@ using namespace llvm; namespace { +static cl::opt<bool> StressCalls( + "amdgpu-stress-function-calls", + cl::Hidden, + cl::desc("Force all functions to be noinline"), + cl::init(false)); + class AMDGPUAlwaysInline : public ModulePass { bool GlobalOpt; @@ -57,9 +63,13 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) { } } + auto NewAttr = StressCalls ? Attribute::NoInline : Attribute::AlwaysInline; + auto IncompatAttr + = StressCalls ? 
Attribute::AlwaysInline : Attribute::NoInline; + for (Function &F : M) { if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() && - !F.hasFnAttribute(Attribute::NoInline)) + !F.hasFnAttribute(IncompatAttr)) FuncsToClone.push_back(&F); } @@ -71,8 +81,8 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) { } for (Function &F : M) { - if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) { - F.addFnAttr(Attribute::AlwaysInline); + if (F.hasLocalLinkage() && !F.hasFnAttribute(IncompatAttr)) { + F.addFnAttr(NewAttr); } } return false; diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index c68e5861ff25..ce17202f3414 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===// +//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===// // // The LLVM Compiler Infrastructure // @@ -14,13 +14,28 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "amdgpu-annotate-kernel-features" @@ -42,6 +57,7 @@ public: bool doInitialization(CallGraph &CG) override; bool runOnSCC(CallGraphSCC &SCC) override; + StringRef getPassName() const override { return "AMDGPU Annotate Kernel Features"; } @@ -58,7 +74,7 @@ public: AMDGPUAS AS); }; -} +} // end anonymous namespace char AMDGPUAnnotateKernelFeatures::ID = 0; @@ -156,8 +172,9 @@ static StringRef intrinsicToAttrName(Intrinsic::ID ID, case Intrinsic::amdgcn_dispatch_id: return "amdgpu-dispatch-id"; case Intrinsic::amdgcn_kernarg_segment_ptr: - case Intrinsic::amdgcn_implicitarg_ptr: return "amdgpu-kernarg-segment-ptr"; + case Intrinsic::amdgcn_implicitarg_ptr: + return "amdgpu-implicitarg-ptr"; case Intrinsic::amdgcn_queue_ptr: case Intrinsic::trap: case Intrinsic::debugtrap: @@ -190,7 +207,8 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee, { "amdgpu-work-group-id-z" }, { "amdgpu-dispatch-ptr" }, { "amdgpu-dispatch-id" }, - { "amdgpu-kernarg-segment-ptr" } + { "amdgpu-kernarg-segment-ptr" }, + { "amdgpu-implicitarg-ptr" } }; if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) @@ -292,7 +310,6 @@ bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) { Changed |= addFeatureAttributes(*F); } - return Changed; } diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp new file mode 100644 index 000000000000..dcca3a2fab96 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -0,0 +1,131 @@ +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This 
file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUArgumentUsageInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-argument-reg-usage-info" + +INITIALIZE_PASS(AMDGPUArgumentUsageInfo, DEBUG_TYPE, + "Argument Register Usage Information Storage", false, true) + +void ArgDescriptor::print(raw_ostream &OS, + const TargetRegisterInfo *TRI) const { + if (!isSet()) { + OS << "<not set>\n"; + return; + } + + if (isRegister()) + OS << "Reg " << printReg(getRegister(), TRI) << '\n'; + else + OS << "Stack offset " << getStackOffset() << '\n'; +} + +char AMDGPUArgumentUsageInfo::ID = 0; + +const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{}; + +bool AMDGPUArgumentUsageInfo::doInitialization(Module &M) { + return false; +} + +bool AMDGPUArgumentUsageInfo::doFinalization(Module &M) { + ArgInfoMap.clear(); + return false; +} + +void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { + for (const auto &FI : ArgInfoMap) { + OS << "Arguments for " << FI.first->getName() << '\n' + << " PrivateSegmentBuffer: " << FI.second.PrivateSegmentBuffer + << " DispatchPtr: " << FI.second.DispatchPtr + << " QueuePtr: " << FI.second.QueuePtr + << " KernargSegmentPtr: " << FI.second.KernargSegmentPtr + << " DispatchID: " << FI.second.DispatchID + << " FlatScratchInit: " << FI.second.FlatScratchInit + << " PrivateSegmentSize: " << FI.second.PrivateSegmentSize + << " GridWorkgroupCountX: " << FI.second.GridWorkGroupCountX + << " GridWorkgroupCountY: " << FI.second.GridWorkGroupCountY + << " GridWorkgroupCountZ: " << FI.second.GridWorkGroupCountZ + << " WorkGroupIDX: " << FI.second.WorkGroupIDX + << " WorkGroupIDY: " << FI.second.WorkGroupIDY + << " WorkGroupIDZ: " << FI.second.WorkGroupIDZ + << " WorkGroupInfo: " << FI.second.WorkGroupInfo + << " PrivateSegmentWaveByteOffset: " + << FI.second.PrivateSegmentWaveByteOffset + << " ImplicitBufferPtr: " << FI.second.ImplicitBufferPtr + << " ImplicitArgPtr: " << FI.second.ImplicitArgPtr + << " WorkItemIDX " << FI.second.WorkItemIDX + << " WorkItemIDY " << FI.second.WorkItemIDY + << " WorkItemIDZ " << FI.second.WorkItemIDZ + << '\n'; + } +} + +std::pair<const ArgDescriptor *, const TargetRegisterClass *> +AMDGPUFunctionArgInfo::getPreloadedValue( + AMDGPUFunctionArgInfo::PreloadedValue Value) const { + switch (Value) { + case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER: { + return std::make_pair( + PrivateSegmentBuffer ? &PrivateSegmentBuffer : nullptr, + &AMDGPU::SGPR_128RegClass); + } + case AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR: + return std::make_pair(ImplicitBufferPtr ? &ImplicitBufferPtr : nullptr, + &AMDGPU::SGPR_64RegClass); + case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: + return std::make_pair(WorkGroupIDX ? &WorkGroupIDX : nullptr, + &AMDGPU::SGPR_32RegClass); + + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: + return std::make_pair(WorkGroupIDY ? &WorkGroupIDY : nullptr, + &AMDGPU::SGPR_32RegClass); + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: + return std::make_pair(WorkGroupIDZ ? &WorkGroupIDZ : nullptr, + &AMDGPU::SGPR_32RegClass); + case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: + return std::make_pair( + PrivateSegmentWaveByteOffset ? 
&PrivateSegmentWaveByteOffset : nullptr, + &AMDGPU::SGPR_32RegClass); + case AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR: + return std::make_pair(KernargSegmentPtr ? &KernargSegmentPtr : nullptr, + &AMDGPU::SGPR_64RegClass); + case AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR: + return std::make_pair(ImplicitArgPtr ? &ImplicitArgPtr : nullptr, + &AMDGPU::SGPR_64RegClass); + case AMDGPUFunctionArgInfo::DISPATCH_ID: + return std::make_pair(DispatchID ? &DispatchID : nullptr, + &AMDGPU::SGPR_64RegClass); + case AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT: + return std::make_pair(FlatScratchInit ? &FlatScratchInit : nullptr, + &AMDGPU::SGPR_64RegClass); + case AMDGPUFunctionArgInfo::DISPATCH_PTR: + return std::make_pair(DispatchPtr ? &DispatchPtr : nullptr, + &AMDGPU::SGPR_64RegClass); + case AMDGPUFunctionArgInfo::QUEUE_PTR: + return std::make_pair(QueuePtr ? &QueuePtr : nullptr, + &AMDGPU::SGPR_64RegClass); + case AMDGPUFunctionArgInfo::WORKITEM_ID_X: + return std::make_pair(WorkItemIDX ? &WorkItemIDX : nullptr, + &AMDGPU::VGPR_32RegClass); + case AMDGPUFunctionArgInfo::WORKITEM_ID_Y: + return std::make_pair(WorkItemIDY ? &WorkItemIDY : nullptr, + &AMDGPU::VGPR_32RegClass); + case AMDGPUFunctionArgInfo::WORKITEM_ID_Z: + return std::make_pair(WorkItemIDZ ? &WorkItemIDZ : nullptr, + &AMDGPU::VGPR_32RegClass); + } + llvm_unreachable("unexpected preloaded value type"); +} diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h new file mode 100644 index 000000000000..bf9635549a8c --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -0,0 +1,177 @@ +//==- AMDGPUArgumentrUsageInfo.h - Function Arg Usage Info -------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/Function.h" +#include "llvm/Pass.h" + +namespace llvm { + +class Function; +class raw_ostream; +class SISubtarget; +class TargetMachine; +class TargetRegisterClass; +class TargetRegisterInfo; + +struct ArgDescriptor { +private: + friend struct AMDGPUFunctionArgInfo; + friend class AMDGPUArgumentUsageInfo; + + union { + unsigned Register; + unsigned StackOffset; + }; + + bool IsStack : 1; + bool IsSet : 1; + + ArgDescriptor(unsigned Val = 0, bool IsStack = false, bool IsSet = false) + : Register(Val), IsStack(IsStack), IsSet(IsSet) {} +public: + static ArgDescriptor createRegister(unsigned Reg) { + return ArgDescriptor(Reg, false, true); + } + + static ArgDescriptor createStack(unsigned Reg) { + return ArgDescriptor(Reg, true, true); + } + + bool isSet() const { + return IsSet; + } + + explicit operator bool() const { + return isSet(); + } + + bool isRegister() const { + return !IsStack; + } + + unsigned getRegister() const { + assert(!IsStack); + return Register; + } + + unsigned getStackOffset() const { + assert(IsStack); + return StackOffset; + } + + void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { + Arg.print(OS); + return OS; +} + +struct AMDGPUFunctionArgInfo { + enum PreloadedValue { + // SGPRS: + PRIVATE_SEGMENT_BUFFER = 0, + DISPATCH_PTR = 1, + QUEUE_PTR = 2, + KERNARG_SEGMENT_PTR = 3, + DISPATCH_ID = 4, + FLAT_SCRATCH_INIT = 5, + WORKGROUP_ID_X = 10, + WORKGROUP_ID_Y = 11, + WORKGROUP_ID_Z = 12, + PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, + IMPLICIT_BUFFER_PTR = 15, + IMPLICIT_ARG_PTR = 16, + + // VGPRS: + WORKITEM_ID_X = 17, + WORKITEM_ID_Y = 18, + WORKITEM_ID_Z = 19, + FIRST_VGPR_VALUE = WORKITEM_ID_X + }; + + // Kernel input registers setup for the HSA ABI in allocation order. + + // User SGPRs in kernels + // XXX - Can these require argument spills? + ArgDescriptor PrivateSegmentBuffer; + ArgDescriptor DispatchPtr; + ArgDescriptor QueuePtr; + ArgDescriptor KernargSegmentPtr; + ArgDescriptor DispatchID; + ArgDescriptor FlatScratchInit; + ArgDescriptor PrivateSegmentSize; + ArgDescriptor GridWorkGroupCountX; + ArgDescriptor GridWorkGroupCountY; + ArgDescriptor GridWorkGroupCountZ; + + // System SGPRs in kernels. + ArgDescriptor WorkGroupIDX; + ArgDescriptor WorkGroupIDY; + ArgDescriptor WorkGroupIDZ; + ArgDescriptor WorkGroupInfo; + ArgDescriptor PrivateSegmentWaveByteOffset; + + // Pointer with offset from kernargsegmentptr to where special ABI arguments + // are passed to callable functions. + ArgDescriptor ImplicitArgPtr; + + // Input registers for non-HSA ABI + ArgDescriptor ImplicitBufferPtr = 0; + + // VGPRs inputs. These are always v0, v1 and v2 for entry functions. 
+ ArgDescriptor WorkItemIDX; + ArgDescriptor WorkItemIDY; + ArgDescriptor WorkItemIDZ; + + std::pair<const ArgDescriptor *, const TargetRegisterClass *> + getPreloadedValue(PreloadedValue Value) const; +}; + +class AMDGPUArgumentUsageInfo : public ImmutablePass { +private: + static const AMDGPUFunctionArgInfo ExternFunctionInfo; + DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap; + +public: + static char ID; + + AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + bool doInitialization(Module &M) override; + bool doFinalization(Module &M) override; + + void print(raw_ostream &OS, const Module *M = nullptr) const override; + + void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) { + ArgInfoMap[&F] = ArgInfo; + } + + const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const { + auto I = ArgInfoMap.find(&F); + if (I == ArgInfoMap.end()) { + assert(F.isDeclaration()); + return ExternFunctionInfo; + } + + return I->second; + } +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 2247814cfe55..bb628b8c558f 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -32,15 +32,17 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; +using namespace llvm::AMDGPU; // TODO: This should get the default rounding mode from the kernel. We just set // the default here, but this could change if the OpenCL rounding mode pragmas @@ -105,28 +107,71 @@ const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const { return TM.getMCSubtargetInfo(); } -AMDGPUTargetStreamer& AMDGPUAsmPrinter::getTargetStreamer() const { - return static_cast<AMDGPUTargetStreamer&>(*OutStreamer->getTargetStreamer()); +AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { + if (!OutStreamer) + return nullptr; + return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer()); } void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { - if (TM.getTargetTriple().getOS() != Triple::AMDHSA) + if (TM.getTargetTriple().getArch() != Triple::amdgcn) + return; + + if (TM.getTargetTriple().getOS() != Triple::AMDHSA && + TM.getTargetTriple().getOS() != Triple::AMDPAL) + return; + + if (TM.getTargetTriple().getOS() == Triple::AMDHSA) + HSAMetadataStream.begin(M); + + if (TM.getTargetTriple().getOS() == Triple::AMDPAL) + readPALMetadata(M); + + // Deprecated notes are not emitted for code object v3. + if (IsaInfo::hasCodeObjectV3(getSTI()->getFeatureBits())) return; - AMDGPU::IsaInfo::IsaVersion ISA = - AMDGPU::IsaInfo::getIsaVersion(getSTI()->getFeatureBits()); + // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2. + if (TM.getTargetTriple().getOS() == Triple::AMDHSA) + getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1); - getTargetStreamer().EmitDirectiveHSACodeObjectVersion(2, 1); - getTargetStreamer().EmitDirectiveHSACodeObjectISA( + // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2. 
+ IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(getSTI()->getFeatureBits()); + getTargetStreamer()->EmitDirectiveHSACodeObjectISA( ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU"); - getTargetStreamer().EmitStartOfCodeObjectMetadata(M); } void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { - if (TM.getTargetTriple().getOS() != Triple::AMDHSA) + if (TM.getTargetTriple().getArch() != Triple::amdgcn) + return; + + // Following code requires TargetStreamer to be present. + if (!getTargetStreamer()) return; - getTargetStreamer().EmitEndOfCodeObjectMetadata(); + // Emit ISA Version (NT_AMD_AMDGPU_ISA). + std::string ISAVersionString; + raw_string_ostream ISAVersionStream(ISAVersionString); + IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream); + getTargetStreamer()->EmitISAVersion(ISAVersionStream.str()); + + // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). + if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { + HSAMetadataStream.end(); + getTargetStreamer()->EmitHSAMetadata(HSAMetadataStream.getHSAMetadata()); + } + + // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA). + if (TM.getTargetTriple().getOS() == Triple::AMDPAL) { + // Copy the PAL metadata from the map where we collected it into a vector, + // then write it as a .note. + PALMD::Metadata PALMetadataVector; + for (auto i : PALMetadataMap) { + PALMetadataVector.push_back(i.first); + PALMetadataVector.push_back(i.second); + } + getTargetStreamer()->EmitPALMetadata(PALMetadataVector); + } } bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( @@ -154,13 +199,15 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF); OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); - getTargetStreamer().EmitAMDKernelCodeT(KernelCode); + getTargetStreamer()->EmitAMDKernelCodeT(KernelCode); } if (TM.getTargetTriple().getOS() != Triple::AMDHSA) return; - getTargetStreamer().EmitKernelCodeObjectMetadata(*MF->getFunction(), - KernelCode); + + HSAMetadataStream.emitKernel(MF->getFunction(), + getHSACodeProps(*MF, CurrentProgramInfo), + getHSADebugProps(*MF, CurrentProgramInfo)); } void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { @@ -168,18 +215,38 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) { SmallString<128> SymbolName; - getNameWithPrefix(SymbolName, MF->getFunction()), - getTargetStreamer().EmitAMDGPUSymbolType( + getNameWithPrefix(SymbolName, &MF->getFunction()), + getTargetStreamer()->EmitAMDGPUSymbolType( SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); } + const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>(); + if (STI.dumpCode()) { + // Disassemble function name label to text. + DisasmLines.push_back(MF->getName().str() + ":"); + DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); + HexLines.push_back(""); + } AsmPrinter::EmitFunctionEntryLabel(); } +void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { + const AMDGPUSubtarget &STI = MBB.getParent()->getSubtarget<AMDGPUSubtarget>(); + if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) { + // Write a line for the basic block label if it is not only fallthrough. 
+ DisasmLines.push_back( + (Twine("BB") + Twine(getFunctionNumber()) + + "_" + Twine(MBB.getNumber()) + ":").str()); + DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); + HexLines.push_back(""); + } + AsmPrinter::EmitBasicBlockStart(MBB); +} + void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // Group segment variables aren't emitted in HSA. - if (AMDGPU::isGroupSegment(GV, AMDGPUASI)) + if (AMDGPU::isGroupSegment(GV)) return; AsmPrinter::EmitGlobalVariable(GV); @@ -190,11 +257,32 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { return AsmPrinter::doFinalization(M); } +// For the amdpal OS type, read the amdgpu.pal.metadata supplied by the +// frontend into our PALMetadataMap, ready for per-function modification. It +// is a NamedMD containing an MDTuple containing a number of MDNodes each of +// which is an integer value, and each two integer values forms a key=value +// pair that we store as PALMetadataMap[key]=value in the map. +void AMDGPUAsmPrinter::readPALMetadata(Module &M) { + auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata"); + if (!NamedMD || !NamedMD->getNumOperands()) + return; + auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0)); + if (!Tuple) + return; + for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) { + auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I)); + auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1)); + if (!Key || !Val) + continue; + PALMetadataMap[Key->getZExtValue()] = Val->getZExtValue(); + } +} + // Print comments that apply to both callable functions and entry points. void AMDGPUAsmPrinter::emitCommonFunctionComments( uint32_t NumVGPR, uint32_t NumSGPR, - uint32_t ScratchSize, + uint64_t ScratchSize, uint64_t CodeSize) { OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); @@ -226,12 +314,14 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { getSIProgramInfo(CurrentProgramInfo, MF); } else { auto I = CallGraphResourceInfo.insert( - std::make_pair(MF.getFunction(), SIFunctionResourceInfo())); + std::make_pair(&MF.getFunction(), SIFunctionResourceInfo())); SIFunctionResourceInfo &Info = I.first->second; assert(I.second && "should only be called once per function"); Info = analyzeResourceUsage(MF); } + if (STM.isAmdPalOS()) + EmitPALMetadata(MF, CurrentProgramInfo); if (!STM.isAmdHsaOS()) { EmitProgramInfoSI(MF, CurrentProgramInfo); } @@ -253,7 +343,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { if (!MFI->isEntryFunction()) { OutStreamer->emitRawComment(" Function info:", false); - SIFunctionResourceInfo &Info = CallGraphResourceInfo[MF.getFunction()]; + SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()]; emitCommonFunctionComments( Info.NumVGPR, Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()), @@ -336,8 +426,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); for (size_t i = 0; i < DisasmLines.size(); ++i) { - std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' '); - Comment += " ; " + HexLines[i] + "\n"; + std::string Comment = "\n"; + if (!HexLines[i].empty()) { + Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' '); + Comment += " ; " + HexLines[i] + "\n"; + } OutStreamer->EmitBytes(StringRef(DisasmLines[i])); 
OutStreamer->EmitBytes(StringRef(Comment)); @@ -376,7 +469,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { unsigned RsrcReg; if (STM.getGeneration() >= R600Subtarget::EVERGREEN) { // Evergreen / Northern Islands - switch (MF.getFunction()->getCallingConv()) { + switch (MF.getFunction().getCallingConv()) { default: LLVM_FALLTHROUGH; case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; @@ -385,7 +478,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { } } else { // R600 / R700 - switch (MF.getFunction()->getCallingConv()) { + switch (MF.getFunction().getCallingConv()) { default: LLVM_FALLTHROUGH; case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH; case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH; @@ -400,7 +493,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); - if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { + if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4); } @@ -500,29 +593,184 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( // If there are no calls, MachineRegisterInfo can tell us the used register // count easily. + // A tail call isn't considered a call for MachineFrameInfo's purposes. + if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) { + MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestVGPRReg = Reg; + break; + } + } - MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestVGPRReg = Reg; - break; + MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestSGPRReg = Reg; + break; + } } + + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. + Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 : + TRI.getHWRegIndex(HighestVGPRReg) + 1; + Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 : + TRI.getHWRegIndex(HighestSGPRReg) + 1; + + return Info; } - MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestSGPRReg = Reg; - break; + int32_t MaxVGPR = -1; + int32_t MaxSGPR = -1; + uint64_t CalleeFrameSize = 0; + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + // TODO: Check regmasks? Do they occur anywhere except calls? 
+ for (const MachineOperand &MO : MI.operands()) { + unsigned Width = 0; + bool IsSGPR = false; + + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg(); + switch (Reg) { + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::SCC: + case AMDGPU::M0: + case AMDGPU::SRC_SHARED_BASE: + case AMDGPU::SRC_SHARED_LIMIT: + case AMDGPU::SRC_PRIVATE_BASE: + case AMDGPU::SRC_PRIVATE_LIMIT: + continue; + + case AMDGPU::NoRegister: + assert(MI.isDebugValue()); + continue; + + case AMDGPU::VCC: + case AMDGPU::VCC_LO: + case AMDGPU::VCC_HI: + Info.UsesVCC = true; + continue; + + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + case AMDGPU::FLAT_SCR_HI: + continue; + + case AMDGPU::TBA: + case AMDGPU::TBA_LO: + case AMDGPU::TBA_HI: + case AMDGPU::TMA: + case AMDGPU::TMA_LO: + case AMDGPU::TMA_HI: + llvm_unreachable("trap handler registers should not be used"); + + default: + break; + } + + if (AMDGPU::SReg_32RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_32RegClass.contains(Reg) && + "trap handler registers should not be used"); + IsSGPR = true; + Width = 1; + } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) { + IsSGPR = false; + Width = 1; + } else if (AMDGPU::SReg_64RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_64RegClass.contains(Reg) && + "trap handler registers should not be used"); + IsSGPR = true; + Width = 2; + } else if (AMDGPU::VReg_64RegClass.contains(Reg)) { + IsSGPR = false; + Width = 2; + } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { + IsSGPR = false; + Width = 3; + } else if (AMDGPU::SReg_128RegClass.contains(Reg)) { + IsSGPR = true; + Width = 4; + } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { + IsSGPR = false; + Width = 4; + } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { + IsSGPR = true; + Width = 8; + } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { + IsSGPR = false; + Width = 8; + } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { + IsSGPR = true; + Width = 16; + } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { + IsSGPR = false; + Width = 16; + } else { + llvm_unreachable("Unknown register class"); + } + unsigned HWReg = TRI.getHWRegIndex(Reg); + int MaxUsed = HWReg + Width - 1; + if (IsSGPR) { + MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; + } else { + MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR; + } + } + + if (MI.isCall()) { + // Pseudo used just to encode the underlying global. Is there a better + // way to track this? + + const MachineOperand *CalleeOp + = TII->getNamedOperand(MI, AMDGPU::OpName::callee); + const Function *Callee = cast<Function>(CalleeOp->getGlobal()); + if (Callee->isDeclaration()) { + // If this is a call to an external function, we can't do much. Make + // conservative guesses. + + // 48 SGPRs - vcc, - flat_scr, -xnack + int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true, + ST.hasFlatAddressSpace()); + MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); + MaxVGPR = std::max(MaxVGPR, 23); + + CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384)); + Info.UsesVCC = true; + Info.UsesFlatScratch = ST.hasFlatAddressSpace(); + Info.HasDynamicallySizedStack = true; + } else { + // We force CodeGen to run in SCC order, so the callee's register + // usage etc. should be the cumulative usage of all callees. 
+ auto I = CallGraphResourceInfo.find(Callee); + assert(I != CallGraphResourceInfo.end() && + "callee should have been handled before caller"); + + MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); + MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); + CalleeFrameSize + = std::max(I->second.PrivateSegmentSize, CalleeFrameSize); + Info.UsesVCC |= I->second.UsesVCC; + Info.UsesFlatScratch |= I->second.UsesFlatScratch; + Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack; + Info.HasRecursion |= I->second.HasRecursion; + } + + if (!Callee->doesNotRecurse()) + Info.HasRecursion = true; + } } } - // We found the maximum register index. They start at 0, so add one to get the - // number of registers. - Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 : - TRI.getHWRegIndex(HighestVGPRReg) + 1; - Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 : - TRI.getHWRegIndex(HighestSGPRReg) + 1; + Info.NumExplicitSGPR = MaxSGPR + 1; + Info.NumVGPR = MaxVGPR + 1; + Info.PrivateSegmentSize += CalleeFrameSize; return Info; } @@ -538,6 +786,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.FlatUsed = Info.UsesFlatScratch; ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion; + if (!isUInt<32>(ProgInfo.ScratchSize)) { + DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), + ProgInfo.ScratchSize, DS_Error); + MF.getFunction().getContext().diagnose(DiagStackSize); + } + const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const SIInstrInfo *TII = STM.getInstrInfo(); @@ -554,8 +808,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { // This can happen due to a compiler bug or when using inline asm. - LLVMContext &Ctx = MF.getFunction()->getContext(); - DiagnosticInfoResourceLimit Diag(*MF.getFunction(), + LLVMContext &Ctx = MF.getFunction().getContext(); + DiagnosticInfoResourceLimit Diag(MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR, DS_Error, DK_ResourceLimit, @@ -582,8 +836,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { // This can happen due to a compiler bug or when using inline asm to use // the registers which are usually reserved for vcc etc. 
- LLVMContext &Ctx = MF.getFunction()->getContext(); - DiagnosticInfoResourceLimit Diag(*MF.getFunction(), + LLVMContext &Ctx = MF.getFunction().getContext(); + DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers", ProgInfo.NumSGPR, DS_Error, DK_ResourceLimit, @@ -602,15 +856,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) { - LLVMContext &Ctx = MF.getFunction()->getContext(); - DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs", + LLVMContext &Ctx = MF.getFunction().getContext(); + DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs", MFI->getNumUserSGPRs(), DS_Error); Ctx.diagnose(Diag); } if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) { - LLVMContext &Ctx = MF.getFunction()->getContext(); - DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory", + LLVMContext &Ctx = MF.getFunction().getContext(); + DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory", MFI->getLDSSize(), DS_Error); Ctx.diagnose(Diag); } @@ -710,10 +964,12 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) { switch (CallConv) { default: LLVM_FALLTHROUGH; case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; + case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS; case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS; + case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES; case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; - case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; + case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; } } @@ -721,9 +977,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) { const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv()); + unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv()); - if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { + if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4); @@ -740,19 +996,24 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(RsrcReg, 4); OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); - if (STM.isVGPRSpillingEnabled(*MF.getFunction())) { + unsigned Rsrc2Val = 0; + if (STM.isVGPRSpillingEnabled(MF.getFunction())) { OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); + if (TM.getTargetTriple().getOS() == Triple::AMDPAL) + Rsrc2Val = S_00B84C_SCRATCH_EN(CurrentProgramInfo.ScratchBlocks > 0); + } + if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { + OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); + OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4); + OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); + OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); + Rsrc2Val |= S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks); + } + if (Rsrc2Val) { + OutStreamer->EmitIntValue(RsrcReg + 4 /*rsrc2*/, 4); + 
OutStreamer->EmitIntValue(Rsrc2Val, 4); } - } - - if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) { - OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); - OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4); - OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); - OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4); - OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); - OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); } OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4); @@ -761,6 +1022,75 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4); } +// This is the equivalent of EmitProgramInfoSI above, but for when the OS type +// is AMDPAL. It stores each compute/SPI register setting and other PAL +// metadata items into the PALMetadataMap, combining with any provided by the +// frontend as LLVM metadata. Once all functions are written, PALMetadataMap is +// then written as a single block in the .note section. +void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, + const SIProgramInfo &CurrentProgramInfo) { + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + // Given the calling convention, calculate the register number for rsrc1. In + // principle the register number could change in future hardware, but we know + // it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so + // we can use the same fixed value that .AMDGPU.config has for Mesa. Note + // that we use a register number rather than a byte offset, so we need to + // divide by 4. + unsigned Rsrc1Reg = getRsrcReg(MF.getFunction().getCallingConv()) / 4; + unsigned Rsrc2Reg = Rsrc1Reg + 1; + // Also calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used + // with a constant offset to access any non-register shader-specific PAL + // metadata key. + unsigned ScratchSizeKey = PALMD::Key::CS_SCRATCH_SIZE; + switch (MF.getFunction().getCallingConv()) { + case CallingConv::AMDGPU_PS: + ScratchSizeKey = PALMD::Key::PS_SCRATCH_SIZE; + break; + case CallingConv::AMDGPU_VS: + ScratchSizeKey = PALMD::Key::VS_SCRATCH_SIZE; + break; + case CallingConv::AMDGPU_GS: + ScratchSizeKey = PALMD::Key::GS_SCRATCH_SIZE; + break; + case CallingConv::AMDGPU_ES: + ScratchSizeKey = PALMD::Key::ES_SCRATCH_SIZE; + break; + case CallingConv::AMDGPU_HS: + ScratchSizeKey = PALMD::Key::HS_SCRATCH_SIZE; + break; + case CallingConv::AMDGPU_LS: + ScratchSizeKey = PALMD::Key::LS_SCRATCH_SIZE; + break; + } + unsigned NumUsedVgprsKey = ScratchSizeKey + + PALMD::Key::VS_NUM_USED_VGPRS - PALMD::Key::VS_SCRATCH_SIZE; + unsigned NumUsedSgprsKey = ScratchSizeKey + + PALMD::Key::VS_NUM_USED_SGPRS - PALMD::Key::VS_SCRATCH_SIZE; + PALMetadataMap[NumUsedVgprsKey] = CurrentProgramInfo.NumVGPRsForWavesPerEU; + PALMetadataMap[NumUsedSgprsKey] = CurrentProgramInfo.NumSGPRsForWavesPerEU; + if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { + PALMetadataMap[Rsrc1Reg] |= CurrentProgramInfo.ComputePGMRSrc1; + PALMetadataMap[Rsrc2Reg] |= CurrentProgramInfo.ComputePGMRSrc2; + // ScratchSize is in bytes, 16 aligned. + PALMetadataMap[ScratchSizeKey] |= + alignTo(CurrentProgramInfo.ScratchSize, 16); + } else { + PALMetadataMap[Rsrc1Reg] |= S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | + S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks); + if (CurrentProgramInfo.ScratchBlocks > 0) + PALMetadataMap[Rsrc2Reg] |= S_00B84C_SCRATCH_EN(1); + // ScratchSize is in bytes, 16 aligned. 
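// A minimal sketch (not from this patch) of the key arithmetic used by
// EmitPALMetadata above. The PAL metadata keys for each hardware stage are
// laid out with the same relative spacing, so an offset measured against the
// VS block can be added to any stage's *_SCRATCH_SIZE key. The enum values
// below are made-up placeholders; the real ones live in the PALMD::Key
// definitions, and only their relative spacing matters here.
#include <cstdint>

namespace palmd_sketch {
enum : uint32_t {            // placeholder values, per-stage blocks 0x10 apart
  VS_SCRATCH_SIZE   = 0x00,
  VS_NUM_USED_VGPRS = 0x04,
  VS_NUM_USED_SGPRS = 0x08,
  GS_SCRATCH_SIZE   = 0x10,
  PS_SCRATCH_SIZE   = 0x20,
};

constexpr uint32_t numUsedVgprsKey(uint32_t ScratchSizeKey) {
  return ScratchSizeKey + (VS_NUM_USED_VGPRS - VS_SCRATCH_SIZE);
}
} // namespace palmd_sketch

// e.g. numUsedVgprsKey(GS_SCRATCH_SIZE) addresses the GS "number of used
// VGPRs" key, which is exactly how NumUsedVgprsKey is derived for any stage.
static_assert(palmd_sketch::numUsedVgprsKey(palmd_sketch::GS_SCRATCH_SIZE) ==
              0x14, "per-stage keys share one layout");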
+ PALMetadataMap[ScratchSizeKey] |= + alignTo(CurrentProgramInfo.ScratchSize, 16); + } + if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { + PALMetadataMap[Rsrc2Reg] |= + S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks); + PALMetadataMap[R_0286CC_SPI_PS_INPUT_ENA / 4] |= MFI->getPSInputEnable(); + PALMetadataMap[R_0286D0_SPI_PS_INPUT_ADDR / 4] |= MFI->getPSInputAddr(); + } +} + // This is supposed to be log2(Size) static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { switch (Size) { @@ -862,23 +1192,81 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, } } +AMDGPU::HSAMD::Kernel::CodeProps::Metadata AMDGPUAsmPrinter::getHSACodeProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const { + const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + HSAMD::Kernel::CodeProps::Metadata HSACodeProps; + + HSACodeProps.mKernargSegmentSize = + STM.getKernArgSegmentSize(MF, MFI.getABIArgOffset()); + HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize; + HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize; + HSACodeProps.mKernargSegmentAlign = + std::max(uint32_t(4), MFI.getMaxKernArgAlign()); + HSACodeProps.mWavefrontSize = STM.getWavefrontSize(); + HSACodeProps.mNumSGPRs = CurrentProgramInfo.NumSGPR; + HSACodeProps.mNumVGPRs = CurrentProgramInfo.NumVGPR; + HSACodeProps.mMaxFlatWorkGroupSize = MFI.getMaxFlatWorkGroupSize(); + HSACodeProps.mIsDynamicCallStack = ProgramInfo.DynamicCallStack; + HSACodeProps.mIsXNACKEnabled = STM.isXNACKEnabled(); + HSACodeProps.mNumSpilledSGPRs = MFI.getNumSpilledSGPRs(); + HSACodeProps.mNumSpilledVGPRs = MFI.getNumSpilledVGPRs(); + + return HSACodeProps; +} + +AMDGPU::HSAMD::Kernel::DebugProps::Metadata AMDGPUAsmPrinter::getHSADebugProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const { + const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + HSAMD::Kernel::DebugProps::Metadata HSADebugProps; + + if (!STM.debuggerSupported()) + return HSADebugProps; + + HSADebugProps.mDebuggerABIVersion.push_back(1); + HSADebugProps.mDebuggerABIVersion.push_back(0); + HSADebugProps.mReservedNumVGPRs = ProgramInfo.ReservedVGPRCount; + HSADebugProps.mReservedFirstVGPR = ProgramInfo.ReservedVGPRFirst; + + if (STM.debuggerEmitPrologue()) { + HSADebugProps.mPrivateSegmentBufferSGPR = + ProgramInfo.DebuggerPrivateSegmentBufferSGPR; + HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR = + ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; + } + + return HSADebugProps; +} + bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { + // First try the generic code, which knows about modifiers like 'c' and 'n'. + if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O)) + return false; + if (ExtraCode && ExtraCode[0]) { if (ExtraCode[1] != 0) return true; // Unknown modifier. switch (ExtraCode[0]) { - default: - // See if this is a generic print operand - return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); case 'r': break; + default: + return true; } } - AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O, - *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo()); - return false; + // TODO: Should be able to support other operand types like globals. 
+ const MachineOperand &MO = MI->getOperand(OpNo); + if (MO.isReg()) { + AMDGPUInstPrinter::printRegOperand(MO.getReg(), O, + *MF->getSubtarget().getRegisterInfo()); + return false; + } + + return true; } diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 0a58ce06704d..51d48a0c7320 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -17,6 +17,7 @@ #include "AMDGPU.h" #include "AMDKernelCodeT.h" +#include "MCTargetDesc/AMDGPUHSAMetadataStreamer.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/AsmPrinter.h" #include <cstddef> @@ -40,7 +41,7 @@ private: // the end are tracked separately. int32_t NumVGPR = 0; int32_t NumExplicitSGPR = 0; - uint32_t PrivateSegmentSize = 0; + uint64_t PrivateSegmentSize = 0; bool UsesVCC = false; bool UsesFlatScratch = false; bool HasDynamicallySizedStack = false; @@ -60,7 +61,7 @@ private: uint32_t DX10Clamp = 0; uint32_t DebugMode = 0; uint32_t IEEEMode = 0; - uint32_t ScratchSize = 0; + uint64_t ScratchSize = 0; uint64_t ComputePGMRSrc1 = 0; @@ -113,9 +114,13 @@ private: SIProgramInfo CurrentProgramInfo; DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo; + AMDGPU::HSAMD::MetadataStreamer HSAMetadataStream; + std::map<uint32_t, uint32_t> PALMetadataMap; + uint64_t getFunctionCodeSize(const MachineFunction &MF) const; SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const; + void readPALMetadata(Module &M); void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF); void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo, const MachineFunction &MF) const; @@ -123,13 +128,23 @@ private: unsigned &NumSGPR, unsigned &NumVGPR) const; + AMDGPU::HSAMD::Kernel::CodeProps::Metadata getHSACodeProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const; + AMDGPU::HSAMD::Kernel::DebugProps::Metadata getHSADebugProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const; + /// \brief Emit register usage information so that the GPU driver /// can correctly setup the GPU state. 
void EmitProgramInfoR600(const MachineFunction &MF); - void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); + void EmitProgramInfoSI(const MachineFunction &MF, + const SIProgramInfo &KernelInfo); + void EmitPALMetadata(const MachineFunction &MF, + const SIProgramInfo &KernelInfo); void emitCommonFunctionComments(uint32_t NumVGPR, uint32_t NumSGPR, - uint32_t ScratchSize, + uint64_t ScratchSize, uint64_t CodeSize); public: @@ -140,7 +155,7 @@ public: const MCSubtargetInfo* getSTI() const; - AMDGPUTargetStreamer& getTargetStreamer() const; + AMDGPUTargetStreamer* getTargetStreamer() const; bool doFinalization(Module &M) override; bool runOnMachineFunction(MachineFunction &MF) override; @@ -166,6 +181,8 @@ public: void EmitFunctionEntryLabel() override; + void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override; + void EmitGlobalVariable(const GlobalVariable *GV) override; void EmitStartOfAsmFile(Module &M) override; @@ -180,8 +197,8 @@ public: raw_ostream &O) override; protected: - std::vector<std::string> DisasmLines, HexLines; - size_t DisasmLineMaxLen; + mutable std::vector<std::string> DisasmLines, HexLines; + mutable size_t DisasmLineMaxLen; AMDGPUAS AMDGPUASI; }; diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 515cc07dd449..5a9138731934 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -26,10 +26,6 @@ using namespace llvm; -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "This shouldn't be built without GISel" -#endif - AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) : CallLowering(&TLI), AMDGPUASI(TLI.getAMDGPUAS()) { } @@ -45,15 +41,15 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, unsigned Offset) const { MachineFunction &MF = MIRBuilder.getMF(); - const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); MachineRegisterInfo &MRI = MF.getRegInfo(); - const Function &F = *MF.getFunction(); + const Function &F = MF.getFunction(); const DataLayout &DL = F.getParent()->getDataLayout(); PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS); LLT PtrType = getLLTForType(*PtrTy, DL); unsigned DstReg = MRI.createGenericVirtualRegister(PtrType); unsigned KernArgSegmentPtr = - TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr); unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); @@ -68,7 +64,7 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy, unsigned Offset, unsigned DstReg) const { MachineFunction &MF = MIRBuilder.getMF(); - const Function &F = *MF.getFunction(); + const Function &F = MF.getFunction(); const DataLayout &DL = F.getParent()->getDataLayout(); PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); @@ -144,18 +140,38 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, Function::const_arg_iterator CurOrigArg = F.arg_begin(); const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>(); for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) { - MVT ValVT = TLI.getValueType(DL, CurOrigArg->getType()).getSimpleVT(); + EVT ValEVT = TLI.getValueType(DL, 
CurOrigArg->getType()); + + // We can only hanlde simple value types at the moment. + if (!ValEVT.isSimple()) + return false; + MVT ValVT = ValEVT.getSimpleVT(); ISD::ArgFlagsTy Flags; + ArgInfo OrigArg{VRegs[i], CurOrigArg->getType()}; + setArgFlags(OrigArg, i + 1, DL, F); Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType())); CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false); bool Res = - AssignFn(i, ValVT, ValVT, CCValAssign::Full, Flags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; + AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo); + + // Fail if we don't know how to handle this type. + if (Res) + return false; } Function::const_arg_iterator Arg = F.arg_begin(); + + if (F.getCallingConv() == CallingConv::AMDGPU_VS) { + for (unsigned i = 0; i != NumArgs; ++i, ++Arg) { + CCValAssign &VA = ArgLocs[i]; + MRI.addLiveIn(VA.getLocReg(), VRegs[i]); + MIRBuilder.getMBB().addLiveIn(VA.getLocReg()); + MIRBuilder.buildCopy(VRegs[i], VA.getLocReg()); + } + return true; + } + for (unsigned i = 0; i != NumArgs; ++i, ++Arg) { // FIXME: We should be getting DebugInfo from the arguments some how. CCValAssign &VA = ArgLocs[i]; diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td index 4bef7a89bfe3..c1c066fd1404 100644 --- a/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -163,6 +163,10 @@ def CC_AMDGPU : CallingConv<[ "AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_SI>>, CCIf<"static_cast<const AMDGPUSubtarget&>" + "(State.getMachineFunction().getSubtarget()).getGeneration() >= " + "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C", + CCDelegateTo<CC_AMDGPU_Func>>, + CCIf<"static_cast<const AMDGPUSubtarget&>" "(State.getMachineFunction().getSubtarget()).getGeneration() < " "AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_R600>> diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 31ee9206ae27..b17b67167666 100644 --- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -18,6 +18,7 @@ #include "AMDGPUTargetMachine.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/Loads.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" @@ -53,6 +54,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass, DivergenceAnalysis *DA = nullptr; Module *Mod = nullptr; bool HasUnsafeFPMath = false; + AMDGPUAS AMDGPUASI; /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to /// binary operation \p V. @@ -123,6 +125,15 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// /// \returns True. bool promoteUniformBitreverseToI32(IntrinsicInst &I) const; + /// \brief Widen a scalar load. + /// + /// \details \p Widen scalar load for uniform, small type loads from constant + // memory / to a full 32-bits and then truncate the input to allow a scalar + // load instead of a vector load. + // + /// \returns True. 
+ + bool canWidenScalarExtLoad(LoadInst &I) const; public: static char ID; @@ -133,6 +144,7 @@ public: bool visitInstruction(Instruction &I) { return false; } bool visitBinaryOperator(BinaryOperator &I); + bool visitLoadInst(LoadInst &I); bool visitICmpInst(ICmpInst &I); bool visitSelectInst(SelectInst &I); @@ -223,6 +235,16 @@ static bool promotedOpIsNUW(const Instruction &I) { } } +bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const { + Type *Ty = I.getType(); + const DataLayout &DL = Mod->getDataLayout(); + int TySize = DL.getTypeSizeInBits(Ty); + unsigned Align = I.getAlignment() ? + I.getAlignment() : DL.getABITypeAlignment(Ty); + + return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I); +} + bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const { assert(needsPromotionToI32(I.getType()) && "I does not need promotion to i32"); @@ -378,7 +400,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { return false; FastMathFlags FMF = FPOp->getFastMathFlags(); - bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() || + bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() || FMF.allowReciprocal(); // With UnsafeDiv node will be optimized to just rcp and mul. @@ -443,6 +465,29 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { return Changed; } +bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { + if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && + canWidenScalarExtLoad(I)) { + IRBuilder<> Builder(&I); + Builder.SetCurrentDebugLocation(I.getDebugLoc()); + + Type *I32Ty = Builder.getInt32Ty(); + Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace()); + Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT); + Value *WidenLoad = Builder.CreateLoad(BitCast); + + int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType()); + Type *IntNTy = Builder.getIntNTy(TySize); + Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy); + Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType()); + I.replaceAllUsesWith(ValOrig); + I.eraseFromParent(); + return true; + } + + return false; +} + bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) { bool Changed = false; diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 8e187c7e56c1..91fe921bfeec 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -15,7 +15,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { @@ -33,10 +33,6 @@ public: /// \returns The number of 32-bit sub-registers that are used when storing /// values to the stack. unsigned getStackWidth(const MachineFunction &MF) const; - - bool hasFP(const MachineFunction &MF) const override { - return false; - } }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def index 5cb9036f4823..bf7deb500d1a 100644 --- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def +++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -11,10 +11,6 @@ /// \todo This should be generated by TableGen. 
//===----------------------------------------------------------------------===// -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "You shouldn't build this" -#endif - namespace llvm { namespace AMDGPU { diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index f235313e4853..f4776adb069c 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -13,10 +13,12 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPUInstrInfo.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" #include "SIDefines.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" @@ -68,19 +70,30 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { // make the right decision when generating code for different targets. const AMDGPUSubtarget *Subtarget; AMDGPUAS AMDGPUASI; + bool EnableLateStructurizeCFG; public: - explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(TM, OptLevel){ - AMDGPUASI = AMDGPU::getAMDGPUAS(TM); + explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr, + CodeGenOpt::Level OptLevel = CodeGenOpt::Default) + : SelectionDAGISel(*TM, OptLevel) { + AMDGPUASI = AMDGPU::getAMDGPUAS(*TM); + EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG; } ~AMDGPUDAGToDAGISel() override = default; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AMDGPUArgumentUsageInfo>(); + SelectionDAGISel::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; void Select(SDNode *N) override; StringRef getPassName() const override; void PostprocessISelDAG() override; +protected: + void SelectBuildVector(SDNode *N, unsigned RegClassID); + private: std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const; bool isNoNanSrc(SDValue N) const; @@ -99,8 +112,8 @@ private: bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, SDValue& Offset); - bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); - bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); + virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); + virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, unsigned OffsetBits) const; bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; @@ -116,10 +129,10 @@ private: bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &SLC) const; - bool SelectMUBUFScratchOffen(SDNode *Root, + bool SelectMUBUFScratchOffen(SDNode *Parent, SDValue Addr, SDValue &RSrc, SDValue &VAddr, SDValue &SOffset, SDValue &ImmOffset) const; - bool SelectMUBUFScratchOffset(SDNode *Root, + bool SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; @@ -140,6 +153,10 @@ private: bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const; + bool SelectFlatAtomicSigned(SDValue Addr, SDValue &VAddr, + SDValue &Offset, SDValue &SLC) const; + + template <bool IsSigned> bool SelectFlatOffset(SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const; @@ -152,10 +169,10 @@ 
private: bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const; bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const; bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const; - bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const; bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3NoMods(SDValue In, SDValue &Src) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, @@ -174,9 +191,22 @@ private: bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp) const; + bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp) const; + + bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp) const; + bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const; + bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + + bool SelectHi16Elt(SDValue In, SDValue &Src) const; + void SelectADD_SUB_I64(SDNode *N); void SelectUADDO_USUBO(SDNode *N); void SelectDIV_SCALE(SDNode *N); + void SelectMAD_64_32(SDNode *N); void SelectFMA_W_CHAIN(SDNode *N); void SelectFMUL_W_CHAIN(SDNode *N); @@ -186,21 +216,49 @@ private: void SelectS_BFE(SDNode *N); bool isCBranchSCC(const SDNode *N) const; void SelectBRCOND(SDNode *N); + void SelectFMAD(SDNode *N); void SelectATOMIC_CMP_SWAP(SDNode *N); +protected: // Include the pieces autogenerated from the target description. #include "AMDGPUGenDAGISel.inc" }; +class R600DAGToDAGISel : public AMDGPUDAGToDAGISel { +public: + explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) : + AMDGPUDAGToDAGISel(TM, OptLevel) {} + + void Select(SDNode *N) override; + + bool SelectADDRIndirect(SDValue Addr, SDValue &Base, + SDValue &Offset) override; + bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, + SDValue &Offset) override; +}; + } // end anonymous namespace +INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "isel", + "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) +INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo) +INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel", + "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) + /// \brief This pass converts a legalized DAG into a AMDGPU-specific // DAG, ready for instruction scheduling. -FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM, +FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel) { return new AMDGPUDAGToDAGISel(TM, OptLevel); } +/// \brief This pass converts a legalized DAG into a R600-specific +// DAG, ready for instruction scheduling. 
+FunctionPass *llvm::createR600ISelDag(TargetMachine *TM, + CodeGenOpt::Level OptLevel) { + return new R600DAGToDAGISel(TM, OptLevel); +} + bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<AMDGPUSubtarget>(); return SelectionDAGISel::runOnMachineFunction(MF); @@ -279,8 +337,8 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, } SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS) + if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS || + !Subtarget->ldsRequiresM0Init()) return N; const SITargetLowering& Lowering = @@ -298,9 +356,7 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { Ops.push_back(N->getOperand(i)); } Ops.push_back(Glue); - CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); - - return N; + return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); } static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { @@ -334,6 +390,58 @@ static bool getConstantValue(SDValue N, uint32_t &Out) { return false; } +void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { + EVT VT = N->getValueType(0); + unsigned NumVectorElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); + SDLoc DL(N); + SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); + + if (NumVectorElts == 1) { + CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0), + RegClass); + return; + } + + assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " + "supported yet"); + // 16 = Max Num Vector Elements + // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) + // 1 = Vector Register Class + SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1); + + RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); + bool IsRegSeq = true; + unsigned NOps = N->getNumOperands(); + for (unsigned i = 0; i < NOps; i++) { + // XXX: Why is this here? + if (isa<RegisterSDNode>(N->getOperand(i))) { + IsRegSeq = false; + break; + } + RegSeqArgs[1 + (2 * i)] = N->getOperand(i); + RegSeqArgs[1 + (2 * i) + 1] = + CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, + MVT::i32); + } + if (NOps != NumVectorElts) { + // Fill in the missing undef elements if this was a scalar_to_vector. + assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); + MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + DL, EltVT); + for (unsigned i = NOps; i < NumVectorElts; ++i) { + RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); + RegSeqArgs[1 + (2 * i) + 1] = + CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32); + } + } + + if (!IsRegSeq) + SelectCode(N); + CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs); +} + void AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { @@ -346,18 +454,16 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { N = glueCopyToM0(N); switch (Opc) { - default: break; + default: + break; // We are selecting i64 ADD here instead of custom lower it during // DAG legalization, so we can fold some i64 ADDs used for address // calculation into the LOAD and STORE instructions. 
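// A small sketch (not from this patch) of the operand list SelectBuildVector
// builds: REG_SEQUENCE takes the destination register class followed by one
// (value, subregister index) pair per element. Plain ints stand in for the
// SDValue operands and for TRI->getSubRegFromChannel(i).
#include <cstddef>
#include <vector>

std::vector<int> buildRegSequenceOps(int RegClassID,
                                     const std::vector<int> &Elts,
                                     const std::vector<int> &SubRegIdx) {
  std::vector<int> Ops;
  Ops.push_back(RegClassID);                 // operand 0: the register class
  for (size_t i = 0; i != Elts.size(); ++i) {
    Ops.push_back(Elts[i]);                  // element value...
    Ops.push_back(SubRegIdx[i]);             // ...and the subregister it fills
  }
  return Ops;                                // 2 * NumElts + 1 operands total
}
// For a v4i32 build_vector this produces
//   REG_SEQUENCE SReg_128, e0, sub0, e1, sub1, e2, sub2, e3, sub3
// and a scalar_to_vector input simply fills the trailing elements with
// IMPLICIT_DEF, as the NOps != NumVectorElts branch above does.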
- case ISD::ADD: case ISD::ADDC: case ISD::ADDE: - case ISD::SUB: case ISD::SUBC: case ISD::SUBE: { - if (N->getValueType(0) != MVT::i64 || - Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + if (N->getValueType(0) != MVT::i64) break; SelectADD_SUB_I64(N); @@ -378,13 +484,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { } case ISD::SCALAR_TO_VECTOR: - case AMDGPUISD::BUILD_VERTICAL_VECTOR: case ISD::BUILD_VECTOR: { - unsigned RegClassID; - const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); - EVT EltVT = VT.getVectorElementType(); if (VT == MVT::v2i16 || VT == MVT::v2f16) { if (Opc == ISD::BUILD_VECTOR) { @@ -401,81 +503,13 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { break; } - assert(EltVT.bitsEq(MVT::i32)); - - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - RegClassID = selectSGPRVectorRegClassID(NumVectorElts); - } else { - // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG - // that adds a 128 bits reg copy when going through TwoAddressInstructions - // pass. We want to avoid 128 bits copies as much as possible because they - // can't be bundled by our scheduler. - switch(NumVectorElts) { - case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; - case 4: - if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) - RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; - else - RegClassID = AMDGPU::R600_Reg128RegClassID; - break; - default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); - } - } - - SDLoc DL(N); - SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); - - if (NumVectorElts == 1) { - CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0), - RegClass); - return; - } - - assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " - "supported yet"); - // 16 = Max Num Vector Elements - // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) - // 1 = Vector Register Class - SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1); - - RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); - bool IsRegSeq = true; - unsigned NOps = N->getNumOperands(); - for (unsigned i = 0; i < NOps; i++) { - // XXX: Why is this here? - if (isa<RegisterSDNode>(N->getOperand(i))) { - IsRegSeq = false; - break; - } - RegSeqArgs[1 + (2 * i)] = N->getOperand(i); - RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, - MVT::i32); - } - - if (NOps != NumVectorElts) { - // Fill in the missing undef elements if this was a scalar_to_vector. 
- assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); - - MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - DL, EltVT); - for (unsigned i = NOps; i < NumVectorElts; ++i) { - RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); - RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32); - } - } - - if (!IsRegSeq) - break; - CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs); + assert(VT.getVectorElementType().bitsEq(MVT::i32)); + unsigned RegClassID = selectSGPRVectorRegClassID(NumVectorElts); + SelectBuildVector(N, RegClassID); return; } case ISD::BUILD_PAIR: { SDValue RC, SubReg0, SubReg1; - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - break; - } SDLoc DL(N); if (N->getValueType(0) == MVT::i128) { RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32); @@ -497,8 +531,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::Constant: case ISD::ConstantFP: { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) + if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) break; uint64_t Imm; @@ -533,9 +566,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - break; - // There is a scalar version available, but unlike the vector version which // has a separate operand for the offset and width, the scalar version packs // the width and offset into a single operand. Try to move to the scalar @@ -565,6 +595,11 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectDIV_SCALE(N); return; } + case AMDGPUISD::MAD_I64_I32: + case AMDGPUISD::MAD_U64_U32: { + SelectMAD_64_32(N); + return; + } case ISD::CopyToReg: { const SITargetLowering& Lowering = *static_cast<const SITargetLowering*>(getTargetLowering()); @@ -575,8 +610,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::SRL: case ISD::SRA: case ISD::SIGN_EXTEND_INREG: - if (N->getValueType(0) != MVT::i32 || - Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + if (N->getValueType(0) != MVT::i32) break; SelectS_BFE(N); @@ -584,7 +618,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::BRCOND: SelectBRCOND(N); return; - + case ISD::FMAD: + SelectFMAD(N); + return; case AMDGPUISD::ATOMIC_CMP_SWAP: SelectATOMIC_CMP_SWAP(N); return; @@ -638,32 +674,8 @@ bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, } bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, - SDValue &Offset) { - ConstantSDNode *IMMOffset; - - if (Addr.getOpcode() == ISD::ADD - && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) - && isInt<16>(IMMOffset->getZExtValue())) { - - Base = Addr.getOperand(0); - Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), - MVT::i32); - return true; - // If the pointer address is constant, we can move it to the offset field. 
- } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr)) - && isInt<16>(IMMOffset->getZExtValue())) { - Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), - SDLoc(CurDAG->getEntryNode()), - AMDGPU::ZERO, MVT::i32); - Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), - MVT::i32); - return true; - } - - // Default case, no offset - Base = Addr; - Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - return true; + SDValue &Offset) { + return false; } bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, @@ -690,6 +702,7 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, return true; } +// FIXME: Should only handle addcarry/subcarry void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDLoc DL(N); SDValue LHS = N->getOperand(0); @@ -699,8 +712,7 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE); bool ProduceCarry = ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC; - bool IsAdd = - (Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE); + bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE; SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); @@ -782,7 +794,7 @@ void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) { void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) { SDLoc SL(N); - // src0_modifiers, src0, src1_modifiers, src1, clamp, omod + // src0_modifiers, src0, src1_modifiers, src1, clamp, omod SDValue Ops[8]; SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]); @@ -808,6 +820,19 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } +// We need to handle this here because tablegen doesn't support matching +// instructions with multiple outputs. +void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { + SDLoc SL(N); + bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; + unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32; + + SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + Clamp }; + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); +} + bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, unsigned OffsetBits) const { if ((OffsetBits == 16 && !isUInt<16>(Offset)) || @@ -850,8 +875,12 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, Zero, Addr.getOperand(1)); if (isDSOffsetLegal(Sub, ByteOffset, 16)) { + // FIXME: Select to VOP3 version for with-carry. + unsigned SubOp = Subtarget->hasAddNoCarry() ? + AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32; + MachineSDNode *MachineSub - = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32, + = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Zero, Addr.getOperand(1)); Base = SDValue(MachineSub, 0); @@ -920,8 +949,11 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, Zero, Addr.getOperand(1)); if (isDSOffsetLegal(Sub, DWordOffset1, 8)) { + unsigned SubOp = Subtarget->hasAddNoCarry() ? 
+ AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32; + MachineSDNode *MachineSub - = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32, + = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Zero, Addr.getOperand(1)); Base = SDValue(MachineSub, 0); @@ -958,14 +990,6 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, return true; } -static bool isLegalMUBUFImmOffset(unsigned Imm) { - return isUInt<12>(Imm); -} - -static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { - return isLegalMUBUFImmOffset(Imm->getZExtValue()); -} - bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, @@ -1007,7 +1031,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, Ptr = N0; } - if (isLegalMUBUFImmOffset(C1)) { + if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) { Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); return true; } @@ -1104,7 +1128,7 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const MVT::i32)); } -bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root, +bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, SDValue Addr, SDValue &Rsrc, SDValue &VAddr, SDValue &SOffset, SDValue &ImmOffset) const { @@ -1117,8 +1141,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root, if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { unsigned Imm = CAddr->getZExtValue(); - assert(!isLegalMUBUFImmOffset(Imm) && - "should have been selected by other pattern"); SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32); MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, @@ -1127,7 +1149,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root, // In a call sequence, stores to the argument stack area are relative to the // stack pointer. - const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo(); + const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo(); unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ? Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg(); @@ -1142,9 +1164,25 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root, SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); - // Offsets in vaddr must be positive. + // Offsets in vaddr must be positive if range checking is enabled. + // + // The total computation of vaddr + soffset + offset must not overflow. If + // vaddr is negative, even if offset is 0 the sgpr offset add will end up + // overflowing. + // + // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would + // always perform a range check. If a negative vaddr base index was used, + // this would fail the range check. The overall address computation would + // compute a valid address, but this doesn't happen due to the range + // check. For out-of-bounds MUBUF loads, a 0 is returned. + // + // Therefore it should be safe to fold any VGPR offset on gfx9 into the + // MUBUF vaddr, but not on older subtargets which can only do this if the + // sign bit is known 0. 
ConstantSDNode *C1 = cast<ConstantSDNode>(N1); - if (isLegalMUBUFImmOffset(C1)) { + if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) && + (!Subtarget->privateMemoryResourceIsRangeChecked() || + CurDAG->SignBitIsZero(N0))) { std::tie(VAddr, SOffset) = foldFrameIndex(N0); ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); return true; @@ -1157,13 +1195,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root, return true; } -bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root, +bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset) const { ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr); - if (!CAddr || !isLegalMUBUFImmOffset(CAddr)) + if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) return false; SDLoc DL(Addr); @@ -1172,7 +1210,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root, SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); - const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo(); + const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo(); unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ? Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg(); @@ -1231,24 +1269,30 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant, SDValue &SOffset, SDValue &ImmOffset) const { SDLoc DL(Constant); + const uint32_t Align = 4; + const uint32_t MaxImm = alignDown(4095, Align); uint32_t Imm = cast<ConstantSDNode>(Constant)->getZExtValue(); uint32_t Overflow = 0; - if (Imm >= 4096) { - if (Imm <= 4095 + 64) { - // Use an SOffset inline constant for 1..64 - Overflow = Imm - 4095; - Imm = 4095; + if (Imm > MaxImm) { + if (Imm <= MaxImm + 64) { + // Use an SOffset inline constant for 4..64 + Overflow = Imm - MaxImm; + Imm = MaxImm; } else { // Try to keep the same value in SOffset for adjacent loads, so that // the corresponding register contents can be re-used. // - // Load values with all low-bits set into SOffset, so that a larger - // range of values can be covered using s_movk_i32 - uint32_t High = (Imm + 1) & ~4095; - uint32_t Low = (Imm + 1) & 4095; + // Load values with all low-bits (except for alignment bits) set into + // SOffset, so that a larger range of values can be covered using + // s_movk_i32. + // + // Atomic operations fail to work correctly when individual address + // components are unaligned, even if their sum is aligned. 
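// A standalone sketch (not from this patch) of the split performed by
// SelectMUBUFConstant: the 12-bit immediate is capped at
// alignDown(4095, 4) = 4092 so both halves of the split stay 4-byte aligned,
// and the invariant is ImmOut + SOffsetOut == ImmIn.
#include <cassert>
#include <cstdint>

void splitMUBUFOffset(uint32_t ImmIn, uint32_t &ImmOut, uint32_t &SOffsetOut) {
  const uint32_t Align = 4;
  const uint32_t MaxImm = 4092;           // alignDown(4095, Align)
  ImmOut = ImmIn;
  SOffsetOut = 0;
  if (ImmIn > MaxImm) {
    if (ImmIn <= MaxImm + 64) {
      // Small overflow: keep it in SOffset as an inline constant (up to 64).
      SOffsetOut = ImmIn - MaxImm;
      ImmOut = MaxImm;
    } else {
      // Keep the aligned high part in SOffset so adjacent accesses can share
      // the same s_movk_i32 value.
      ImmOut = (ImmIn + Align) & 4095u;
      SOffsetOut = ((ImmIn + Align) & ~4095u) - Align;
    }
  }
  assert(ImmOut + SOffsetOut == ImmIn && ImmOut <= 4095);
}
// e.g. 4100 -> Imm 4092 + SOffset 8; 8192 -> Imm 4 + SOffset 8188.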
+ uint32_t High = (Imm + Align) & ~4095; + uint32_t Low = (Imm + Align) & 4095; Imm = Low; - Overflow = High - 1; + Overflow = High - Align; } } @@ -1316,6 +1360,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset, return true; } +template <bool IsSigned> bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr, SDValue &VAddr, SDValue &Offset, @@ -1326,8 +1371,10 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr, CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); - uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getZExtValue(); - if (isUInt<12>(COffsetVal)) { + int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); + + if ((IsSigned && isInt<13>(COffsetVal)) || + (!IsSigned && isUInt<12>(COffsetVal))) { Addr = N0; OffsetVal = COffsetVal; } @@ -1344,7 +1391,14 @@ bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const { - return SelectFlatOffset(Addr, VAddr, Offset, SLC); + return SelectFlatOffset<false>(Addr, VAddr, Offset, SLC); +} + +bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDValue Addr, + SDValue &VAddr, + SDValue &Offset, + SDValue &SLC) const { + return SelectFlatOffset<true>(Addr, VAddr, Offset, SLC); } bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, @@ -1443,13 +1497,6 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr, return !Imm && isa<ConstantSDNode>(Offset); } -bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr, - SDValue &Offset) const { - bool Imm; - return SelectSMRDOffset(Addr, Offset, Imm) && !Imm && - !isa<ConstantSDNode>(Offset); -} - bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const { @@ -1622,18 +1669,55 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { return; } - if (isCBranchSCC(N)) { - // This brcond will use S_CBRANCH_SCC*, so let tablegen handle it. + bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N); + unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ; + unsigned CondReg = UseSCCBr ? AMDGPU::SCC : AMDGPU::VCC; + SDLoc SL(N); + + SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond); + CurDAG->SelectNodeTo(N, BrOp, MVT::Other, + N->getOperand(2), // Basic Block + VCC.getValue(0)); +} + +void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) { + MVT VT = N->getSimpleValueType(0); + if (VT != MVT::f32 || !Subtarget->hasMadMixInsts()) { SelectCode(N); return; } - SDLoc SL(N); + SDValue Src0 = N->getOperand(0); + SDValue Src1 = N->getOperand(1); + SDValue Src2 = N->getOperand(2); + unsigned Src0Mods, Src1Mods, Src2Mods; + + // Avoid using v_mad_mix_f32 unless there is actually an operand using the + // conversion from f16. + bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods); + bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods); + bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods); + + assert(!Subtarget->hasFP32Denormals() && + "fmad selected with denormals enabled"); + // TODO: We can select this with f32 denormals enabled if all the sources are + // converted from f16 (in which case fmad isn't legal). + + if (Sel0 || Sel1 || Sel2) { + // For dummy operands. 
+ SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); + SDValue Ops[] = { + CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0, + CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1, + CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2, + CurDAG->getTargetConstant(0, SDLoc(), MVT::i1), + Zero, Zero + }; - SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC, Cond); - CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other, - N->getOperand(2), // Basic Block - VCC.getValue(0)); + CurDAG->SelectNodeTo(N, AMDGPU::V_MAD_MIX_F32, MVT::f32, Ops); + } else { + SelectCode(N); + } } // This is here because there isn't a way to use the generated sub0_sub1 as the @@ -1652,11 +1736,11 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { MachineSDNode *CmpSwap = nullptr; if (Subtarget->hasAddr64()) { - SDValue SRsrc, VAddr, SOffset, Offset, GLC, SLC; + SDValue SRsrc, VAddr, SOffset, Offset, SLC; if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) { - unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_ADDR64 : - AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_ADDR64; + unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN : + AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN; SDValue CmpVal = Mem->getOperand(2); // XXX - Do we care about glue operands? @@ -1672,8 +1756,8 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { if (!CmpSwap) { SDValue SRsrc, SOffset, Offset, SLC; if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) { - unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET : - AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_OFFSET; + unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN : + AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN; SDValue CmpVal = Mem->getOperand(2); SDValue Ops[] = { @@ -1702,9 +1786,9 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { CurDAG->RemoveDeadNode(N); } -bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, - SDValue &SrcMods) const { - unsigned Mods = 0; +bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, + unsigned &Mods) const { + Mods = 0; Src = In; if (Src.getOpcode() == ISD::FNEG) { @@ -1717,10 +1801,20 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, Src = Src.getOperand(0); } - SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } +bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods; + if (SelectVOP3ModsImpl(In, Src, Mods)) { + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; + } + + return false; +} + bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const { SelectVOP3Mods(In, Src, SrcMods); @@ -1864,24 +1958,234 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src, return SelectVOP3PMods(In, Src, SrcMods); } +bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + // FIXME: Handle op_sel + SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3OpSel0(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Clamp) const { + SDLoc SL(In); + + // FIXME: Handle clamp + Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32); + + return SelectVOP3OpSel(In, Src, SrcMods); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + // 
FIXME: Handle op_sel + return SelectVOP3Mods(In, Src, SrcMods); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods0(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Clamp) const { + SDLoc SL(In); + + // FIXME: Handle clamp + Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32); + + return SelectVOP3OpSelMods(In, Src, SrcMods); +} + +// The return value is not whether the match is possible (which it always is), +// but whether or not it a conversion is really used. +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, + unsigned &Mods) const { + Mods = 0; + SelectVOP3ModsImpl(In, Src, Mods); + + if (Src.getOpcode() == ISD::FP_EXTEND) { + Src = Src.getOperand(0); + assert(Src.getValueType() == MVT::f16); + Src = stripBitcast(Src); + + // Be careful about folding modifiers if we already have an abs. fneg is + // applied last, so we don't want to apply an earlier fneg. + if ((Mods & SISrcMods::ABS) == 0) { + unsigned ModsTmp; + SelectVOP3ModsImpl(Src, Src, ModsTmp); + + if ((ModsTmp & SISrcMods::NEG) != 0) + Mods ^= SISrcMods::NEG; + + if ((ModsTmp & SISrcMods::ABS) != 0) + Mods |= SISrcMods::ABS; + } + + // op_sel/op_sel_hi decide the source type and source. + // If the source's op_sel_hi is set, it indicates to do a conversion from fp16. + // If the sources's op_sel is set, it picks the high half of the source + // register. + + Mods |= SISrcMods::OP_SEL_1; + if (isExtractHiElt(Src, Src)) { + Mods |= SISrcMods::OP_SEL_0; + + // TODO: Should we try to look for neg/abs here? + } + + return true; + } + + return false; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + SelectVOP3PMadMixModsImpl(In, Src, Mods); + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +// TODO: Can we identify things like v_mad_mixhi_f16? 
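// A compact sketch (not from this patch) of the source-modifier encoding the
// routine above produces for v_mad_mix operands. The enum values are
// placeholders for the real SISrcMods bits: OP_SEL_1 marks the source as f16
// to be converted to f32, and OP_SEL_0 additionally selects the high half of
// the 32-bit register holding it.
namespace srcmods_sketch {
enum : unsigned { NEG = 1, ABS = 2, OP_SEL_0 = 4, OP_SEL_1 = 8 }; // placeholders

unsigned madMixMods(bool IsF16Src, bool UsesHiHalf, bool Neg, bool Abs) {
  unsigned Mods = 0;
  if (Neg) Mods |= NEG;
  if (Abs) Mods |= ABS;
  if (IsF16Src) {
    Mods |= OP_SEL_1;                 // operand is f16, converted on read
    if (UsesHiHalf)
      Mods |= OP_SEL_0;               // take bits [31:16] of the VGPR
  }
  return Mods;
}
} // namespace srcmods_sketch
// e.g. an operand of the form fpext(extract_hi(v2f16 v)) gets
// OP_SEL_1 | OP_SEL_0, which is what the isExtractHiElt() path above sets.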
+bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const { + if (In.isUndef()) { + Src = In; + return true; + } + + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) { + SDLoc SL(In); + SDValue K = CurDAG->getTargetConstant(C->getZExtValue() << 16, SL, MVT::i32); + MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + SL, MVT::i32, K); + Src = SDValue(MovK, 0); + return true; + } + + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) { + SDLoc SL(In); + SDValue K = CurDAG->getTargetConstant( + C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32); + MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + SL, MVT::i32, K); + Src = SDValue(MovK, 0); + return true; + } + + return isExtractHiElt(In, Src); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); bool IsModified = false; do { IsModified = false; + // Go over all selected nodes and try to fold them a bit more - for (SDNode &Node : CurDAG->allnodes()) { - MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(&Node); + SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin(); + while (Position != CurDAG->allnodes_end()) { + SDNode *Node = &*Position++; + MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node); if (!MachineNode) continue; SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG); - if (ResNode != &Node) { - ReplaceUses(&Node, ResNode); + if (ResNode != Node) { + if (ResNode) + ReplaceUses(Node, ResNode); IsModified = true; } } CurDAG->RemoveDeadNodes(); } while (IsModified); } + +void R600DAGToDAGISel::Select(SDNode *N) { + unsigned int Opc = N->getOpcode(); + if (N->isMachineOpcode()) { + N->setNodeId(-1); + return; // Already selected. + } + + switch (Opc) { + default: break; + case AMDGPUISD::BUILD_VERTICAL_VECTOR: + case ISD::SCALAR_TO_VECTOR: + case ISD::BUILD_VECTOR: { + EVT VT = N->getValueType(0); + unsigned NumVectorElts = VT.getVectorNumElements(); + unsigned RegClassID; + // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG + // that adds a 128 bits reg copy when going through TwoAddressInstructions + // pass. We want to avoid 128 bits copies as much as possible because they + // can't be bundled by our scheduler. 
+ switch(NumVectorElts) { + case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; + case 4: + if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) + RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; + else + RegClassID = AMDGPU::R600_Reg128RegClassID; + break; + default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); + } + SelectBuildVector(N, RegClassID); + return; + } + } + + SelectCode(N); +} + +bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *C; + SDLoc DL(Addr); + + if ((C = dyn_cast<ConstantSDNode>(Addr))) { + Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && + (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) { + Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && + (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) { + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else { + Base = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); + } + + return true; +} + +bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *IMMOffset; + + if (Addr.getOpcode() == ISD::ADD + && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) + && isInt<16>(IMMOffset->getZExtValue())) { + + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), + MVT::i32); + return true; + // If the pointer address is constant, we can move it to the offset field. 
+ } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr)) + && isInt<16>(IMMOffset->getZExtValue())) { + Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), + SDLoc(CurDAG->getEntryNode()), + AMDGPU::ZERO, MVT::i32); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), + MVT::i32); + return true; + } + + // Default case, no offset + Base = Addr; + Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + return true; +} diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 258b1737deb3..49929441ef21 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -13,6 +13,10 @@ // //===----------------------------------------------------------------------===// +#define AMDGPU_LOG2E_F 1.44269504088896340735992468100189214f +#define AMDGPU_LN2_F 0.693147180559945309417232121458176568f +#define AMDGPU_LN10_F 2.30258509299404568401799145468436421f + #include "AMDGPUISelLowering.h" #include "AMDGPU.h" #include "AMDGPUCallLowering.h" @@ -20,6 +24,7 @@ #include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" #include "R600MachineFunctionInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -127,27 +132,20 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); } -bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op) -{ - assert(Op.getOpcode() == ISD::OR); - - SDValue N0 = Op->getOperand(0); - SDValue N1 = Op->getOperand(1); - EVT VT = N0.getValueType(); - - if (VT.isInteger() && !VT.isVector()) { - KnownBits LHSKnown, RHSKnown; - DAG.computeKnownBits(N0, LHSKnown); +unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) { + KnownBits Known; + EVT VT = Op.getValueType(); + DAG.computeKnownBits(Op, Known); - if (LHSKnown.Zero.getBoolValue()) { - DAG.computeKnownBits(N1, RHSKnown); + return VT.getSizeInBits() - Known.countMinLeadingZeros(); +} - if (!(~RHSKnown.Zero & ~LHSKnown.Zero)) - return true; - } - } +unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); - return false; + // In order for this to be a signed 24-bit value, bit 23, must + // be a sign bit. 
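// A standalone illustration (not from this patch) of the two helpers, with
// plain integers in place of KnownBits / ComputeNumSignBits. For an i32
// input, a value fits the unsigned 24-bit multiply path when
// numBitsUnsigned(x) <= 24, and the signed path when numBitsSigned(x) <= 23,
// i.e. bits [31:23] are all copies of the sign bit.
unsigned numBitsUnsignedSketch(unsigned KnownLeadingZeros) {
  return 32 - KnownLeadingZeros;      // SizeInBits - countMinLeadingZeros
}

unsigned numBitsSignedSketch(unsigned NumSignBits) {
  return 32 - NumSignBits;            // SizeInBits - ComputeNumSignBits
}
// Example: for x = sext i16 y to i32, ComputeNumSignBits(x) >= 17, so
// numBitsSignedSketch(17) == 15 and x is safely a signed 24-bit operand.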
+ return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op); } AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, @@ -323,6 +321,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FROUND, MVT::f32, Custom); setOperationAction(ISD::FROUND, MVT::f64, Custom); + setOperationAction(ISD::FLOG, MVT::f32, Custom); + setOperationAction(ISD::FLOG10, MVT::f32, Custom); + + if (Subtarget->has16BitInsts()) { + setOperationAction(ISD::FLOG, MVT::f16, Custom); + setOperationAction(ISD::FLOG10, MVT::f16, Custom); + } + setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); @@ -399,8 +405,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::MUL, MVT::i64, Expand); setOperationAction(ISD::MULHU, MVT::i64, Expand); setOperationAction(ISD::MULHS, MVT::i64, Expand); - setOperationAction(ISD::UDIV, MVT::i32, Expand); - setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); @@ -416,8 +420,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); if (Subtarget->hasFFBL()) - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); + setOperationAction(ISD::CTTZ, MVT::i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom); setOperationAction(ISD::CTLZ, MVT::i64, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); @@ -475,6 +481,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + setOperationAction(ISD::SETCC, VT, Expand); } static const MVT::SimpleValueType FloatVectorTypes[] = { @@ -492,6 +499,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); @@ -507,6 +516,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + setOperationAction(ISD::SETCC, VT, Expand); } // This causes using an unrolled select operation rather than expansion with @@ -822,6 +832,17 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return isZExtFree(Val.getValueType(), VT2); } +// v_mad_mix* support a conversion from f16 to f32. +// +// There is only one special case when denormals are enabled we don't currently, +// where this is OK to use. 
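// Concretely: when mad-mix instructions are available and f32 denormals are
// off, an f16-to-f32 fpext feeding an FMAD operand (for example
// fmad(fpext(x:f16), y:f32, z:f32)) can be folded into the operand and the
// whole expression can then typically be selected as a single mixed-precision
// v_mad_mix_f32.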
+bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode, + EVT DestVT, EVT SrcVT) const { + return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() && + DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() && + SrcVT.getScalarType() == MVT::f16; +} + bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { // There aren't really 64-bit registers, but pairs of 32-bit ones and only a // limited number of native 64-bit operations. Shrinking an operation to fit @@ -847,9 +868,12 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: + case CallingConv::AMDGPU_LS: return CC_AMDGPU; case CallingConv::C: case CallingConv::Fast: + case CallingConv::Cold: return CC_AMDGPU_Func; default: report_fatal_error("Unsupported calling convention."); @@ -867,9 +891,12 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: + case CallingConv::AMDGPU_LS: return RetCC_SI_Shader; case CallingConv::C: case CallingConv::Fast: + case CallingConv::Cold: return RetCC_AMDGPU_Func; default: report_fatal_error("Unsupported calling convention."); @@ -1000,12 +1027,49 @@ CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg); } -SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl<SDValue> &InVals) const { +SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain, + SelectionDAG &DAG, + MachineFrameInfo &MFI, + int ClobberedFI) const { + SmallVector<SDValue, 8> ArgChains; + int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); + int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; + + // Include the original chain at the beginning of the list. When this is + // used by target LowerCall hooks, this helps legalize find the + // CALLSEQ_BEGIN node. + ArgChains.push_back(Chain); + + // Add a chain value for each stack argument corresponding + for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), + UE = DAG.getEntryNode().getNode()->use_end(); + U != UE; ++U) { + if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) { + if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) { + if (FI->getIndex() < 0) { + int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); + int64_t InLastByte = InFirstByte; + InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; + + if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || + (FirstByte <= InFirstByte && InFirstByte <= LastByte)) + ArgChains.push_back(SDValue(L, 1)); + } + } + } + } + + // Build a tokenfactor for all the chains. 
+ return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); +} + +SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals, + StringRef Reason) const { SDValue Callee = CLI.Callee; SelectionDAG &DAG = CLI.DAG; - const Function &Fn = *DAG.getMachineFunction().getFunction(); + const Function &Fn = DAG.getMachineFunction().getFunction(); StringRef FuncName("<unknown>"); @@ -1015,7 +1079,7 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, FuncName = G->getGlobal()->getName(); DiagnosticInfoUnsupported NoCalls( - Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc()); + Fn, Reason + FuncName, CLI.DL.getDebugLoc()); DAG.getContext()->diagnose(NoCalls); if (!CLI.IsTailCall) { @@ -1026,9 +1090,14 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, return DAG.getEntryNode(); } +SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + return lowerUnhandledCall(CLI, InVals, "unsupported call to function "); +} + SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { - const Function &Fn = *DAG.getMachineFunction().getFunction(); + const Function &Fn = DAG.getMachineFunction().getFunction(); DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()); @@ -1057,14 +1126,20 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); case ISD::FROUND: return LowerFROUND(Op, DAG); case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); + case ISD::FLOG: + return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F); + case ISD::FLOG10: + return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: - return LowerCTLZ(Op, DAG); + return LowerCTLZ_CTTZ(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } return Op; @@ -1115,7 +1190,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, } } - const Function &Fn = *DAG.getMachineFunction().getFunction(); + const Function &Fn = DAG.getMachineFunction().getFunction(); DiagnosticInfoUnsupported BadInit( Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc()); DAG.getContext()->diagnose(BadInit); @@ -1261,7 +1336,6 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, return scalarizeVectorLoad(Load, DAG); SDValue BasePtr = Load->getBasePtr(); - EVT PtrVT = BasePtr.getValueType(); EVT MemVT = Load->getMemoryVT(); SDLoc SL(Op); @@ -1282,8 +1356,7 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue, LoMemVT, BaseAlign, Load->getMemOperand()->getFlags()); - SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(Size, SL, PtrVT)); + SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size); SDValue HiLoad = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), @@ -1322,10 +1395,7 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, 
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); - EVT PtrVT = BasePtr.getValueType(); - SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(LoMemVT.getStoreSize(), SL, - PtrVT)); + SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize()); const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); unsigned BaseAlign = Store->getAlignment(); @@ -1454,49 +1524,181 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results) const { - assert(Op.getValueType() == MVT::i64); - SDLoc DL(Op); EVT VT = Op.getValueType(); + + assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64"); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - SDValue one = DAG.getConstant(1, DL, HalfVT); - SDValue zero = DAG.getConstant(0, DL, HalfVT); + SDValue One = DAG.getConstant(1, DL, HalfVT); + SDValue Zero = DAG.getConstant(0, DL, HalfVT); //HiLo split SDValue LHS = Op.getOperand(0); - SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); - SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); + SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One); SDValue RHS = Op.getOperand(1); - SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); - SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); + SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); + SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One); - if (VT == MVT::i64 && - DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && - DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { + if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && + DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), LHS_Lo, RHS_Lo); - SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero}); - SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero}); + SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero}); + SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero}); Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV)); Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM)); return; } + if (isTypeLegal(MVT::i64)) { + // Compute denominator reciprocal. + unsigned FMAD = Subtarget->hasFP32Denormals() ? 
+ (unsigned)AMDGPUISD::FMAD_FTZ : + (unsigned)ISD::FMAD; + + SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo); + SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi); + SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi, + DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32), + Cvt_Lo); + SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1); + SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp, + DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32)); + SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1, + DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32)); + SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2); + SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc, + DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32), + Mul1); + SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2); + SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc); + SDValue Rcp64 = DAG.getBitcast(VT, + DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi})); + + SDValue Zero64 = DAG.getConstant(0, DL, VT); + SDValue One64 = DAG.getConstant(1, DL, VT); + SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1); + SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1); + + SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS); + SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64); + SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1); + SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, + Zero); + SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, + One); + + SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo, + Mulhi1_Lo, Zero1); + SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi, + Mulhi1_Hi, Add1_Lo.getValue(1)); + SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi); + SDValue Add1 = DAG.getBitcast(VT, + DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi})); + + SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1); + SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2); + SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, + Zero); + SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, + One); + + SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo, + Mulhi2_Lo, Zero1); + SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc, + Mulhi2_Hi, Add1_Lo.getValue(1)); + SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC, + Zero, Add2_Lo.getValue(1)); + SDValue Add2 = DAG.getBitcast(VT, + DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi})); + SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2); + + SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3); + + SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero); + SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One); + SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo, + Mul3_Lo, Zero1); + SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi, + Mul3_Hi, Sub1_Lo.getValue(1)); + SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi); + SDValue Sub1 = DAG.getBitcast(VT, + DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi})); + + SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT); + SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero, + ISD::SETUGE); + 
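    // In scalar terms, the C1..C6 selects computed around this point perform
    // the usual "correct the estimate at most twice" step of a reciprocal-based
    // 64-bit division. A hedged standalone sketch in plain integer arithmetic
    // (the lambda and its names are illustrative only; Quot corresponds to
    // Mulhi3 above and Rem to Sub1):
    auto UDivRem64FixupSketch = [](uint64_t LHS, uint64_t RHS, uint64_t &Quot,
                                   uint64_t &Rem) {
      // On entry Quot holds the reciprocal-based estimate, which this sequence
      // assumes is at most two below the true quotient.
      Rem = LHS - Quot * RHS;                  // Sub1 = LHS - RHS * Mulhi3
      if (Rem >= RHS) { ++Quot; Rem -= RHS; }  // first fix-up: C1/C2/C3, Add3, Sub2
      if (Rem >= RHS) { ++Quot; Rem -= RHS; }  // second fix-up: C4/C5/C6, Add4, Sub3
    };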
SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero, + ISD::SETUGE); + SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ); + + // TODO: Here and below portions of the code can be enclosed into if/endif. + // Currently control flow is unconditional and we have 4 selects after + // potential endif to substitute PHIs. + + // if C3 != 0 ... + SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo, + RHS_Lo, Zero1); + SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi, + RHS_Hi, Sub1_Lo.getValue(1)); + SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi, + Zero, Sub2_Lo.getValue(1)); + SDValue Sub2 = DAG.getBitcast(VT, + DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi})); + + SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64); + + SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero, + ISD::SETUGE); + SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero, + ISD::SETUGE); + SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ); + + // if (C6 != 0) + SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64); + + SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo, + RHS_Lo, Zero1); + SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi, + RHS_Hi, Sub2_Lo.getValue(1)); + SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi, + Zero, Sub3_Lo.getValue(1)); + SDValue Sub3 = DAG.getBitcast(VT, + DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi})); + + // endif C6 + // endif C3 + + SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE); + SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE); + + SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE); + SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE); + + Results.push_back(Div); + Results.push_back(Rem); + + return; + } + + // R600 expansion.
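  // The fallback below is a restoring (shift/subtract) division: the high half
  // is handled with the speculative UDIV/UREM just after this, then the low 32
  // bits are folded in one at a time. A hedged scalar sketch of that loop
  // (illustrative names; it mirrors the DAG nodes rather than being a
  // general-purpose divider):
  auto RestoringDivSketch = [](uint32_t LHSLo, uint64_t Rem, uint64_t RHS,
                               uint32_t &DivLo) {
    DivLo = 0;
    for (int BitPos = 31; BitPos >= 0; --BitPos) {
      Rem = (Rem << 1) | ((LHSLo >> BitPos) & 1); // shift in the next dividend bit
      if (Rem >= RHS) {                           // the SETUGE select below
        DivLo |= 1u << BitPos;                    // set this quotient bit
        Rem -= RHS;                               // and reduce the remainder
      }
    }
    return Rem;                                   // final 64-bit remainder
  };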
// Get Speculative values SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); - SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); - SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero}); + SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ); + SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero}); REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM); - SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); - SDValue DIV_Lo = zero; + SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ); + SDValue DIV_Lo = Zero; const unsigned halfBitWidth = HalfVT.getSizeInBits(); @@ -1505,7 +1707,7 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, SDValue POS = DAG.getConstant(bitPos, DL, HalfVT); // Get value of high bit SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); - HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); + HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One); HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); // Shift @@ -1514,7 +1716,7 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT); - SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); + SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE); DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); @@ -1971,13 +2173,45 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } -SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { +SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG, + double Log2BaseInverted) const { + EVT VT = Op.getValueType(); + + SDLoc SL(Op); + SDValue Operand = Op.getOperand(0); + SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand); + SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT); + + return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand); +} + +static bool isCtlzOpc(unsigned Opc) { + return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; +} + +static bool isCttzOpc(unsigned Opc) { + return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; +} + +SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); - bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; + bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF || + Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; + + unsigned ISDOpc, NewOpc; + if (isCtlzOpc(Op.getOpcode())) { + ISDOpc = ISD::CTLZ_ZERO_UNDEF; + NewOpc = AMDGPUISD::FFBH_U32; + } else if (isCttzOpc(Op.getOpcode())) { + ISDOpc = ISD::CTTZ_ZERO_UNDEF; + NewOpc = AMDGPUISD::FFBL_B32; + } else + llvm_unreachable("Unexpected OPCode!!!"); + if (ZeroUndef && Src.getValueType() == MVT::i32) - return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src); + return DAG.getNode(NewOpc, SL, MVT::i32, Src); SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); @@ -1990,24 +2224,32 @@ SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); - SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ); + SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? 
Hi : Lo; + SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ); - SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo); - SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi); + SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo); + SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi); const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32); - SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32); - - // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) - SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi); + SDValue Add, NewOpr; + if (isCtlzOpc(Op.getOpcode())) { + Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32); + // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) + NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi); + } else { + Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32); + // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x)) + NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo); + } if (!ZeroUndef) { // Test if the full 64-bit input is zero. // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32, // which we probably don't want. - SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ); - SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0); + SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi; + SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ); + SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0); // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction // with the same cycles, otherwise it is slower. @@ -2018,11 +2260,11 @@ SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { // The instruction returns -1 for 0 input, but the defined intrinsic // behavior is to return the number of bits. - NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, - SrcIsZero, Bits32, NewCtlz); + NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, + SrcIsZero, Bits32, NewOpr); } - return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz); + return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr); } SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, @@ -2389,21 +2631,14 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, //===----------------------------------------------------------------------===// static bool isU24(SDValue Op, SelectionDAG &DAG) { - KnownBits Known; - EVT VT = Op.getValueType(); - DAG.computeKnownBits(Op, Known); - - return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24; + return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24; } static bool isI24(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); - - // In order for this to be a signed 24-bit value, bit 23, must - // be a sign bit. return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated // as unsigned 24-bit values. - (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; + AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24; } static bool simplifyI24(SDNode *Node24, unsigned OpIdx, @@ -2665,11 +2900,21 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: case ISD::ANY_EXTEND: { + SDValue X = LHS->getOperand(0); + + if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 && + isTypeLegal(MVT::v2i16)) { + // Prefer build_vector as the canonical form if packed types are legal. 
+ // (shl ([asz]ext i16:x), 16 -> build_vector 0, x + SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL, + { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) }); + return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); + } + // shl (ext x) => zext (shl x), if shift does not overflow int if (VT != MVT::i64) break; KnownBits Known; - SDValue X = LHS->getOperand(0); DAG.computeKnownBits(X, Known); unsigned LZ = Known.countMinLeadingZeros(); if (LZ < RHSVal) @@ -2678,21 +2923,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0)); return DAG.getZExtOrTrunc(Shl, SL, VT); } - case ISD::OR: - if (!isOrEquivalentToAdd(DAG, LHS)) - break; - LLVM_FALLTHROUGH; - case ISD::ADD: { - // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1) - if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { - SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0), - SDValue(RHS, 0)); - SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal, - SDLoc(C2), VT); - return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V); - } - break; - } } if (VT != MVT::i64) @@ -2924,13 +3154,10 @@ static bool isNegativeOne(SDValue Val) { return false; } -static bool isCtlzOpc(unsigned Opc) { - return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; -} - -SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG, +SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG, SDValue Op, - const SDLoc &DL) const { + const SDLoc &DL, + unsigned Opc) const { EVT VT = Op.getValueType(); EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT); if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() && @@ -2940,11 +3167,11 @@ SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG, if (VT != MVT::i32) Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); - SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op); + SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op); if (VT != MVT::i32) - FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH); + FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX); - return FFBH; + return FFBX; } // The native instructions return -1 on 0 input. Optimize out a select that @@ -2954,7 +3181,7 @@ SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG, // against the bitwidth. // // TODO: Should probably combine against FFBH_U32 instead of ctlz directly. -SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, +SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const { ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); @@ -2965,20 +3192,25 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); SDValue CmpLHS = Cond.getOperand(0); + unsigned Opc = isCttzOpc(RHS.getOpcode()) ? 
AMDGPUISD::FFBL_B32 : + AMDGPUISD::FFBH_U32; + // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x + // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x if (CCOpcode == ISD::SETEQ && - isCtlzOpc(RHS.getOpcode()) && + (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) { - return getFFBH_U32(DAG, CmpLHS, SL); + return getFFBX_U32(DAG, CmpLHS, SL, Opc); } // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x + // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x if (CCOpcode == ISD::SETNE && - isCtlzOpc(LHS.getOpcode()) && + (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) { - return getFFBH_U32(DAG, CmpLHS, SL); + return getFFBX_U32(DAG, CmpLHS, SL, Opc); } return SDValue(); @@ -3111,7 +3343,7 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, } // There's no reason to not do this if the condition has other uses. - return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); + return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI); } static bool isConstantFPZero(SDValue N) { @@ -3581,6 +3813,48 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); } +SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG, + EVT VT, + const SDLoc &SL, + int64_t Offset) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true); + auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset); + SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32); + + return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4, + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); +} + +SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG, + const SDLoc &SL, + SDValue Chain, + SDValue StackPtr, + SDValue ArgVal, + int64_t Offset) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset); + + SDValue Ptr = DAG.getObjectPtrOffset(SL, StackPtr, Offset); + SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4, + MachineMemOperand::MODereferenceable); + return Store; +} + +SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, + const TargetRegisterClass *RC, + EVT VT, const SDLoc &SL, + const ArgDescriptor &Arg) const { + assert(Arg && "Attempting to load missing argument"); + + if (Arg.isRegister()) + return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL); + return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); +} + uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const { unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr(); @@ -3608,6 +3882,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(ELSE) NODE_NAME_CASE(LOOP) NODE_NAME_CASE(CALL) + NODE_NAME_CASE(TC_RETURN) NODE_NAME_CASE(TRAP) NODE_NAME_CASE(RET_FLAG) NODE_NAME_CASE(RETURN_TO_EPILOG) @@ -3655,6 +3930,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFM) NODE_NAME_CASE(FFBH_U32) NODE_NAME_CASE(FFBH_I32) + NODE_NAME_CASE(FFBL_B32) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MULHI_U24) @@ -3663,6 +3939,8 @@ const char* 
AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(MUL_LOHI_I24) NODE_NAME_CASE(MAD_U24) NODE_NAME_CASE(MAD_I24) + NODE_NAME_CASE(MAD_I64_I32) + NODE_NAME_CASE(MAD_U64_U32) NODE_NAME_CASE(TEXTURE_FETCH) NODE_NAME_CASE(EXPORT) NODE_NAME_CASE(EXPORT_DONE) @@ -3704,6 +3982,19 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(ATOMIC_DEC) NODE_NAME_CASE(BUFFER_LOAD) NODE_NAME_CASE(BUFFER_LOAD_FORMAT) + NODE_NAME_CASE(BUFFER_STORE) + NODE_NAME_CASE(BUFFER_STORE_FORMAT) + NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) + NODE_NAME_CASE(BUFFER_ATOMIC_ADD) + NODE_NAME_CASE(BUFFER_ATOMIC_SUB) + NODE_NAME_CASE(BUFFER_ATOMIC_SMIN) + NODE_NAME_CASE(BUFFER_ATOMIC_UMIN) + NODE_NAME_CASE(BUFFER_ATOMIC_SMAX) + NODE_NAME_CASE(BUFFER_ATOMIC_UMAX) + NODE_NAME_CASE(BUFFER_ATOMIC_AND) + NODE_NAME_CASE(BUFFER_ATOMIC_OR) + NODE_NAME_CASE(BUFFER_ATOMIC_XOR) + NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } return nullptr; @@ -3754,7 +4045,6 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( Known.resetAll(); // Don't know anything. - KnownBits Known2; unsigned Opc = Op.getOpcode(); switch (Opc) { @@ -3787,6 +4077,51 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16); break; } + case AMDGPUISD::MUL_U24: + case AMDGPUISD::MUL_I24: { + KnownBits LHSKnown, RHSKnown; + DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1); + DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1); + + unsigned TrailZ = LHSKnown.countMinTrailingZeros() + + RHSKnown.countMinTrailingZeros(); + Known.Zero.setLowBits(std::min(TrailZ, 32u)); + + unsigned LHSValBits = 32 - std::max(LHSKnown.countMinSignBits(), 8u); + unsigned RHSValBits = 32 - std::max(RHSKnown.countMinSignBits(), 8u); + unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u); + if (MaxValBits >= 32) + break; + bool Negative = false; + if (Opc == AMDGPUISD::MUL_I24) { + bool LHSNegative = !!(LHSKnown.One & (1 << 23)); + bool LHSPositive = !!(LHSKnown.Zero & (1 << 23)); + bool RHSNegative = !!(RHSKnown.One & (1 << 23)); + bool RHSPositive = !!(RHSKnown.Zero & (1 << 23)); + if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive)) + break; + Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative); + } + if (Negative) + Known.One.setHighBits(32 - MaxValBits); + else + Known.Zero.setHighBits(32 - MaxValBits); + break; + } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + switch (IID) { + case Intrinsic::amdgcn_mbcnt_lo: + case Intrinsic::amdgcn_mbcnt_hi: { + // These return at most the wavefront size - 1. + unsigned Size = Op.getValueType().getSizeInBits(); + Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2()); + break; + } + default: + break; + } + } } } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index d85aada6053a..3f8a9b1964ca 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -18,13 +18,13 @@ #include "AMDGPU.h" #include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/Target/TargetLowering.h" +#include "llvm/CodeGen/TargetLowering.h" namespace llvm { class AMDGPUMachineFunction; class AMDGPUSubtarget; -class MachineRegisterInfo; +struct ArgDescriptor; class AMDGPUTargetLowering : public TargetLowering { private: @@ -32,10 +32,11 @@ private: /// legalized from a smaller type VT. 
Need to match pre-legalized type because /// the generic legalization inserts the add/sub between the select and /// compare. - SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const; + SDValue getFFBX_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, unsigned Opc) const; public: - static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op); + static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG); + static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG); protected: const AMDGPUSubtarget *Subtarget; @@ -56,8 +57,10 @@ protected: SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFLOG(SDValue Op, SelectionDAG &Dag, + double Log2BaseInverted) const; - SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; @@ -88,7 +91,7 @@ protected: SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, + SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const; SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -143,6 +146,7 @@ public: bool isZExtFree(Type *Src, Type *Dest) const override; bool isZExtFree(EVT Src, EVT Dest) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; + bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const override; bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; @@ -171,6 +175,15 @@ public: const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; + + SDValue addTokenForArgument(SDValue Chain, + SelectionDAG &DAG, + MachineFrameInfo &MFI, + int ClobberedFI) const; + + SDValue lowerUnhandledCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals, + StringRef Reason) const; SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; @@ -237,6 +250,25 @@ public: return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true); } + /// Similar to CreateLiveInRegister, except value maybe loaded from a stack + /// slot rather than passed in a register. + SDValue loadStackInputValue(SelectionDAG &DAG, + EVT VT, + const SDLoc &SL, + int64_t Offset) const; + + SDValue storeStackInputValue(SelectionDAG &DAG, + const SDLoc &SL, + SDValue Chain, + SDValue StackPtr, + SDValue ArgVal, + int64_t Offset) const; + + SDValue loadInputValue(SelectionDAG &DAG, + const TargetRegisterClass *RC, + EVT VT, const SDLoc &SL, + const ArgDescriptor &Arg) const; + enum ImplicitParameter { FIRST_IMPLICIT, GRID_DIM = FIRST_IMPLICIT, @@ -268,6 +300,7 @@ enum NodeType : unsigned { // Function call. CALL, + TC_RETURN, TRAP, // Masked control flow nodes. @@ -342,12 +375,15 @@ enum NodeType : unsigned { BFM, // Insert a range of bits into a 32-bit word. FFBH_U32, // ctlz with -1 if input is zero. FFBH_I32, + FFBL_B32, // cttz with -1 if input is zero. 
MUL_U24, MUL_I24, MULHI_U24, MULHI_I24, MAD_U24, MAD_I24, + MAD_U64_U32, + MAD_I64_I32, MUL_LOHI_I24, MUL_LOHI_U24, TEXTURE_FETCH, @@ -411,6 +447,19 @@ enum NodeType : unsigned { ATOMIC_DEC, BUFFER_LOAD, BUFFER_LOAD_FORMAT, + BUFFER_STORE, + BUFFER_STORE_FORMAT, + BUFFER_ATOMIC_SWAP, + BUFFER_ATOMIC_ADD, + BUFFER_ATOMIC_SUB, + BUFFER_ATOMIC_SMIN, + BUFFER_ATOMIC_UMIN, + BUFFER_ATOMIC_SMAX, + BUFFER_ATOMIC_UMAX, + BUFFER_ATOMIC_AND, + BUFFER_ATOMIC_OR, + BUFFER_ATOMIC_XOR, + BUFFER_ATOMIC_CMPSWAP, LAST_AMDGPU_ISD_NUMBER }; diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp new file mode 100644 index 000000000000..ff9e7b50ed5c --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUInline.cpp @@ -0,0 +1,208 @@ +//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This is AMDGPU specific replacement of the standard inliner. +/// The main purpose is to account for the fact that calls not only expensive +/// on the AMDGPU, but much more expensive if a private memory pointer is +/// passed to a function as an argument. In this situation, we are unable to +/// eliminate private memory in the caller unless inlined and end up with slow +/// and expensive scratch access. Thus, we boost the inline threshold for such +/// functions here. +/// +//===----------------------------------------------------------------------===// + + +#include "AMDGPU.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/IPO/Inliner.h" + +using namespace llvm; + +#define DEBUG_TYPE "inline" + +static cl::opt<int> +ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200), + cl::desc("Cost of alloca argument")); + +// If the amount of scratch memory to eliminate exceeds our ability to allocate +// it into registers we gain nothing by agressively inlining functions for that +// heuristic. 
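// With the default values above and below (ArgAllocaCost = 2200 and
// ArgAllocaCutoff = 256 bytes), a call site that passes a pointer to, say, a
// 200-byte static private alloca gets its inline threshold raised by 2200,
// while a call site whose pointed-to allocas total more than 256 bytes keeps
// the unboosted base threshold.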
+static cl::opt<unsigned> +ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), + cl::desc("Maximum alloca size to use for inline cost")); + +namespace { + +class AMDGPUInliner : public LegacyInlinerBase { + +public: + AMDGPUInliner() : LegacyInlinerBase(ID) { + initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry()); + Params = getInlineParams(); + } + + static char ID; // Pass identification, replacement for typeid + + unsigned getInlineThreshold(CallSite CS) const; + + InlineCost getInlineCost(CallSite CS) override; + + bool runOnSCC(CallGraphSCC &SCC) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + TargetTransformInfoWrapperPass *TTIWP; + + InlineParams Params; +}; + +} // end anonymous namespace + +char AMDGPUInliner::ID = 0; +INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline", + "AMDGPU Function Integration/Inlining", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline", + "AMDGPU Function Integration/Inlining", false, false) + +Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); } + +bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) { + TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); + return LegacyInlinerBase::runOnSCC(SCC); +} + +void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetTransformInfoWrapperPass>(); + LegacyInlinerBase::getAnalysisUsage(AU); +} + +unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const { + int Thres = Params.DefaultThreshold; + + Function *Caller = CS.getCaller(); + // Listen to the inlinehint attribute when it would increase the threshold + // and the caller does not need to minimize its size. + Function *Callee = CS.getCalledFunction(); + bool InlineHint = Callee && !Callee->isDeclaration() && + Callee->hasFnAttribute(Attribute::InlineHint); + if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres + && !Caller->hasFnAttribute(Attribute::MinSize)) + Thres = Params.HintThreshold.getValue(); + + const DataLayout &DL = Caller->getParent()->getDataLayout(); + if (!Callee) + return (unsigned)Thres; + + const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*Caller->getParent()); + + // If we have a pointer to private array passed into a function + // it will not be optimized out, leaving scratch usage. + // Increase the inline threshold to allow inliniting in this case. + uint64_t AllocaSize = 0; + SmallPtrSet<const AllocaInst *, 8> AIVisited; + for (Value *PtrArg : CS.args()) { + Type *Ty = PtrArg->getType(); + if (!Ty->isPointerTy() || + Ty->getPointerAddressSpace() != AS.PRIVATE_ADDRESS) + continue; + PtrArg = GetUnderlyingObject(PtrArg, DL); + if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) { + if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second) + continue; + AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType()); + // If the amount of stack memory is excessive we will not be able + // to get rid of the scratch anyway, bail out. + if (AllocaSize > ArgAllocaCutoff) { + AllocaSize = 0; + break; + } + } + } + if (AllocaSize) + Thres += ArgAllocaCost; + + return (unsigned)Thres; +} + +// Check if call is just a wrapper around another call. +// In this case we only have call and ret instructions. 
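// Illustration with hypothetical helper functions (assuming mem2reg/SROA has
// already cleaned up the callee so its single basic block is literally one
// call followed by a ret): a callee shaped like wrapperOnly() below is what
// this check accepts, and getInlineCost() further down then reports it as
// always-inline.
static float implDetail(float X) { return X * 2.0f; }
static float wrapperOnly(float X) {
  return implDetail(X); // lowers to a single call ...
}                       // ... immediately followed by the ret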
+static bool isWrapperOnlyCall(CallSite CS) { + Function *Callee = CS.getCalledFunction(); + if (!Callee || Callee->size() != 1) + return false; + const BasicBlock &BB = Callee->getEntryBlock(); + if (const Instruction *I = BB.getFirstNonPHI()) { + if (!isa<CallInst>(I)) { + return false; + } + if (isa<ReturnInst>(*std::next(I->getIterator()))) { + DEBUG(dbgs() << " Wrapper only call detected: " + << Callee->getName() << '\n'); + return true; + } + } + return false; +} + +InlineCost AMDGPUInliner::getInlineCost(CallSite CS) { + Function *Callee = CS.getCalledFunction(); + Function *Caller = CS.getCaller(); + TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); + + if (!Callee || Callee->isDeclaration() || CS.isNoInline() || + !TTI.areInlineCompatible(Caller, Callee)) + return llvm::InlineCost::getNever(); + + if (CS.hasFnAttr(Attribute::AlwaysInline)) { + if (isInlineViable(*Callee)) + return llvm::InlineCost::getAlways(); + return llvm::InlineCost::getNever(); + } + + if (isWrapperOnlyCall(CS)) + return llvm::InlineCost::getAlways(); + + InlineParams LocalParams = Params; + LocalParams.DefaultThreshold = (int)getInlineThreshold(CS); + bool RemarksEnabled = false; + const auto &BBs = Caller->getBasicBlockList(); + if (!BBs.empty()) { + auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front()); + if (DI.isEnabled()) + RemarksEnabled = true; + } + + OptimizationRemarkEmitter ORE(Caller); + std::function<AssumptionCache &(Function &)> GetAssumptionCache = + [this](Function &F) -> AssumptionCache & { + return ACT->getAssumptionCache(F); + }; + + return llvm::getInlineCost(CS, Callee, LocalParams, TTI, GetAssumptionCache, + None, PSI, RemarksEnabled ? &ORE : nullptr); +} diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 69dc52986172..8156599528c2 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -23,14 +23,15 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR -#define GET_INSTRMAP_INFO #include "AMDGPUGenInstrInfo.inc" // Pin the vtable to this file. void AMDGPUInstrInfo::anchor() {} AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST) - : AMDGPUGenInstrInfo(-1, -1), ST(ST), AMDGPUASI(ST.getAMDGPUAS()) {} + : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), + ST(ST), + AMDGPUASI(ST.getAMDGPUAS()) {} // FIXME: This behaves strangely. If, for example, you have 32 load + stores, // the first 16 loads will be interleaved with the stores, and the next 16 will @@ -54,34 +55,15 @@ bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, return (NumLoads <= 16 && (Offset1 - Offset0) < 64); } -int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { - switch (Channels) { - default: return Opcode; - case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1); - case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2); - case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3); - } -} - // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td enum SIEncodingFamily { SI = 0, VI = 1, SDWA = 2, - SDWA9 = 3 + SDWA9 = 3, + GFX9 = 4 }; -// Wrapper for Tablegen'd function. enum Subtarget is not defined in any -// header files, so we need to wrap it in a function that takes unsigned -// instead. 
-namespace llvm { -namespace AMDGPU { -static int getMCOpcode(uint16_t Opcode, unsigned Gen) { - return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen)); -} -} -} - static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { switch (ST.getGeneration()) { case AMDGPUSubtarget::SOUTHERN_ISLANDS: @@ -104,6 +86,11 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { SIEncodingFamily Gen = subtargetEncodingFamily(ST); + + if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && + ST.getGeneration() >= AMDGPUSubtarget::GFX9) + Gen = SIEncodingFamily::GFX9; + if (get(Opcode).TSFlags & SIInstrFlags::SDWA) Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9 : SIEncodingFamily::SDWA; diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 41cc7d7093ec..a9fcd4834638 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -18,10 +18,11 @@ #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "AMDGPUGenInstrInfo.inc" +#undef GET_INSTRINFO_HEADER namespace llvm { @@ -49,10 +50,6 @@ public: /// Return -1 if the target-specific opcode for the pseudo instruction does /// not exist. If Opcode is not a pseudo instruction, this is identity. int pseudoToMCOpcode(int Opcode) const; - - /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the - /// equivalent opcode that writes \p Channels Channels. - int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const; }; } // End llvm namespace diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index bcf89bb78ad6..c024010f3e96 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -74,6 +74,8 @@ def AMDGPUAddeSubeOp : SDTypeProfile<2, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>] >; +def SDT_AMDGPUTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; + //===----------------------------------------------------------------------===// // AMDGPU DAG Nodes // @@ -82,6 +84,26 @@ def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>; def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>; def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>; +def callseq_start : SDNode<"ISD::CALLSEQ_START", + SDCallSeqStart<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>, + [SDNPHasChain, SDNPOutGlue] +>; + +def callseq_end : SDNode<"ISD::CALLSEQ_END", + SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue] +>; + +def AMDGPUcall : SDNode<"AMDGPUISD::CALL", + SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic] +>; + +def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", SDT_AMDGPUTCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] +>; + def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP", SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>, [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue] @@ -276,6 +298,8 @@ def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>; def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>; +def AMDGPUffbl_b32 : SDNode<"AMDGPUISD::FFBL_B32", SDTIntUnaryOp>; + // Signed and unsigned 24-bit multiply. 
The highest 8-bits are ignore // when performing the mulitply. The result is a 32-bit value. def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index e54c887d6090..16d240e96196 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -402,7 +402,8 @@ bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { return Ret; } -bool AMDGPUInstructionSelector::select(MachineInstr &I) const { +bool AMDGPUInstructionSelector::select(MachineInstr &I, + CodeGenCoverage &CoverageInfo) const { if (!isPreISelGenericOpcode(I.getOpcode())) return true; diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index ef845f44d365..715c4882f380 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -35,7 +35,8 @@ public: AMDGPUInstructionSelector(const SISubtarget &STI, const AMDGPURegisterBankInfo &RBI); - bool select(MachineInstr &I) const override; + bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + private: struct GEPInfo { const MachineInstr &GEP; diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 4e688ab0b105..31f728b0c22f 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -42,10 +42,14 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm = "", field bits<32> Inst = 0xffffffff; } -def FP16Denormals : Predicate<"Subtarget.hasFP16Denormals()">; -def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; -def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; +def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">; +def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">; +def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">; +def NoFP16Denormals : Predicate<"!Subtarget->hasFP16Denormals()">; +def NoFP32Denormals : Predicate<"!Subtarget->hasFP32Denormals()">; +def NoFP64Denormals : Predicate<"!Subtarget->hasFP64Denormals()">; def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; +def FMA : Predicate<"Subtarget->hasFMA()">; def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; @@ -130,6 +134,29 @@ def shl_oneuse : HasOneUseBinOp<shl>; def select_oneuse : HasOneUseTernaryOp<select>; +def srl_16 : PatFrag< + (ops node:$src0), (srl_oneuse node:$src0, (i32 16)) +>; + + +def hi_i16_elt : PatFrag< + (ops node:$src0), (i16 (trunc (i32 (srl_16 node:$src0)))) +>; + + +def hi_f16_elt : PatLeaf< + (vt), [{ + if (N->getOpcode() != ISD::BITCAST) + return false; + SDValue Tmp = N->getOperand(0); + + if (Tmp.getOpcode() != ISD::SRL) + return false; + if (const auto *RHS = dyn_cast<ConstantSDNode>(Tmp.getOperand(1)) + return RHS->getZExtValue() == 16; + return false; +}]>; + //===----------------------------------------------------------------------===// // PatLeafs for floating-point comparisons //===----------------------------------------------------------------------===// @@ -164,7 +191,6 @@ def COND_OLE : PatLeaf < [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}] >; - def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>; def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>; @@ -219,75 +245,53 @@ def COND_NULL : PatLeaf < // Load/Store 
Pattern Fragments //===----------------------------------------------------------------------===// -class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS; +class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{ + return cast<MemSDNode>(N)->getAlignment() % 8 == 0; }]>; -class PrivateLoad <SDPatternOperator op> : PrivateMemOp < - (ops node:$ptr), (op node:$ptr) ->; +class LoadFrag <SDPatternOperator op> : PatFrag<(ops node:$ptr), (op node:$ptr)>; -class PrivateStore <SDPatternOperator op> : PrivateMemOp < +class StoreFrag<SDPatternOperator op> : PatFrag < (ops node:$value, node:$ptr), (op node:$value, node:$ptr) >; -def load_private : PrivateLoad <load>; - -def truncstorei8_private : PrivateStore <truncstorei8>; -def truncstorei16_private : PrivateStore <truncstorei16>; -def store_private : PrivateStore <store>; - -class GlobalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS; -}]>; - -// Global address space loads -class GlobalLoad <SDPatternOperator op> : GlobalMemOp < - (ops node:$ptr), (op node:$ptr) +class StoreHi16<SDPatternOperator op> : PatFrag < + (ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr) >; -def global_load : GlobalLoad <load>; - -// Global address space stores -class GlobalStore <SDPatternOperator op> : GlobalMemOp < - (ops node:$value, node:$ptr), (op node:$value, node:$ptr) ->; - -def global_store : GlobalStore <store>; -def global_store_atomic : GlobalStore<atomic_store>; - +class PrivateAddress : CodePatPred<[{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS; +}]>; -class ConstantMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ +class ConstantAddress : CodePatPred<[{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS; }]>; -// Constant address space loads -class ConstantLoad <SDPatternOperator op> : ConstantMemOp < - (ops node:$ptr), (op node:$ptr) ->; - -def constant_load : ConstantLoad<load>; - -class LocalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ +class LocalAddress : CodePatPred<[{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; }]>; -// Local address space loads -class LocalLoad <SDPatternOperator op> : LocalMemOp < - (ops node:$ptr), (op node:$ptr) ->; +class GlobalAddress : CodePatPred<[{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS; +}]>; -class LocalStore <SDPatternOperator op> : LocalMemOp < - (ops node:$value, node:$ptr), (op node:$value, node:$ptr) ->; +class GlobalLoadAddress : CodePatPred<[{ + auto AS = cast<MemSDNode>(N)->getAddressSpace(); + return AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.CONSTANT_ADDRESS; +}]>; -class FlatMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAddressSPace() == AMDGPUASI.FLAT_ADDRESS; +class FlatLoadAddress : CodePatPred<[{ + const auto AS = cast<MemSDNode>(N)->getAddressSpace(); + return AS == AMDGPUASI.FLAT_ADDRESS || + AS == AMDGPUASI.GLOBAL_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS; }]>; -class FlatLoad <SDPatternOperator op> : FlatMemOp < - (ops node:$ptr), (op node:$ptr) ->; +class FlatStoreAddress : CodePatPred<[{ + const auto AS = cast<MemSDNode>(N)->getAddressSpace(); + return AS == AMDGPUASI.FLAT_ADDRESS || + AS == AMDGPUASI.GLOBAL_ADDRESS; +}]>; class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr), (ld_node node:$ptr), [{ @@ -302,72 
+306,105 @@ def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; }]>; -def az_extloadi8_global : GlobalLoad <az_extloadi8>; -def sextloadi8_global : GlobalLoad <sextloadi8>; +def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; -def az_extloadi8_constant : ConstantLoad <az_extloadi8>; -def sextloadi8_constant : ConstantLoad <sextloadi8>; +def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32; +}]>; -def az_extloadi8_local : LocalLoad <az_extloadi8>; -def sextloadi8_local : LocalLoad <sextloadi8>; +class PrivateLoad <SDPatternOperator op> : LoadFrag <op>, PrivateAddress; +class PrivateStore <SDPatternOperator op> : StoreFrag <op>, PrivateAddress; -def extloadi8_private : PrivateLoad <az_extloadi8>; -def sextloadi8_private : PrivateLoad <sextloadi8>; +class LocalLoad <SDPatternOperator op> : LoadFrag <op>, LocalAddress; +class LocalStore <SDPatternOperator op> : StoreFrag <op>, LocalAddress; -def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16; -}]>; +class GlobalLoad <SDPatternOperator op> : LoadFrag<op>, GlobalLoadAddress; +class GlobalStore <SDPatternOperator op> : StoreFrag<op>, GlobalAddress; -def az_extloadi16_global : GlobalLoad <az_extloadi16>; -def sextloadi16_global : GlobalLoad <sextloadi16>; +class FlatLoad <SDPatternOperator op> : LoadFrag <op>, FlatLoadAddress; +class FlatStore <SDPatternOperator op> : StoreFrag <op>, FlatStoreAddress; -def az_extloadi16_constant : ConstantLoad <az_extloadi16>; -def sextloadi16_constant : ConstantLoad <sextloadi16>; +class ConstantLoad <SDPatternOperator op> : LoadFrag <op>, ConstantAddress; -def az_extloadi16_local : LocalLoad <az_extloadi16>; -def sextloadi16_local : LocalLoad <sextloadi16>; -def extloadi16_private : PrivateLoad <az_extloadi16>; +def load_private : PrivateLoad <load>; +def az_extloadi8_private : PrivateLoad <az_extloadi8>; +def sextloadi8_private : PrivateLoad <sextloadi8>; +def az_extloadi16_private : PrivateLoad <az_extloadi16>; def sextloadi16_private : PrivateLoad <sextloadi16>; -def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ - return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32; -}]>; - -def az_extloadi32_global : GlobalLoad <az_extloadi32>; +def store_private : PrivateStore <store>; +def truncstorei8_private : PrivateStore<truncstorei8>; +def truncstorei16_private : PrivateStore <truncstorei16>; +def store_hi16_private : StoreHi16 <truncstorei16>, PrivateAddress; +def truncstorei8_hi16_private : StoreHi16<truncstorei8>, PrivateAddress; -def az_extloadi32_flat : FlatLoad <az_extloadi32>; -def az_extloadi32_constant : ConstantLoad <az_extloadi32>; +def load_global : GlobalLoad <load>; +def sextloadi8_global : GlobalLoad <sextloadi8>; +def az_extloadi8_global : GlobalLoad <az_extloadi8>; +def sextloadi16_global : GlobalLoad <sextloadi16>; +def az_extloadi16_global : GlobalLoad <az_extloadi16>; +def atomic_load_global : GlobalLoad<atomic_load>; +def store_global : GlobalStore <store>; def truncstorei8_global : GlobalStore <truncstorei8>; def truncstorei16_global : GlobalStore <truncstorei16>; +def store_atomic_global : GlobalStore<atomic_store>; +def truncstorei8_hi16_global : StoreHi16 <truncstorei8>, GlobalAddress; +def truncstorei16_hi16_global : StoreHi16 <truncstorei16>, GlobalAddress; -def 
local_store : LocalStore <store>; +def load_local : LocalLoad <load>; +def az_extloadi8_local : LocalLoad <az_extloadi8>; +def sextloadi8_local : LocalLoad <sextloadi8>; +def az_extloadi16_local : LocalLoad <az_extloadi16>; +def sextloadi16_local : LocalLoad <sextloadi16>; + +def store_local : LocalStore <store>; def truncstorei8_local : LocalStore <truncstorei8>; def truncstorei16_local : LocalStore <truncstorei16>; +def store_local_hi16 : StoreHi16 <truncstorei16>, LocalAddress; +def truncstorei8_local_hi16 : StoreHi16<truncstorei8>, LocalAddress; -def local_load : LocalLoad <load>; - -class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{ - return cast<MemSDNode>(N)->getAlignment() % 8 == 0; -}]>; - -def local_load_aligned8bytes : Aligned8Bytes < - (ops node:$ptr), (local_load node:$ptr) +def load_align8_local : Aligned8Bytes < + (ops node:$ptr), (load_local node:$ptr) >; -def local_store_aligned8bytes : Aligned8Bytes < - (ops node:$val, node:$ptr), (local_store node:$val, node:$ptr) +def store_align8_local : Aligned8Bytes < + (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr) >; + +def load_flat : FlatLoad <load>; +def az_extloadi8_flat : FlatLoad <az_extloadi8>; +def sextloadi8_flat : FlatLoad <sextloadi8>; +def az_extloadi16_flat : FlatLoad <az_extloadi16>; +def sextloadi16_flat : FlatLoad <sextloadi16>; +def atomic_load_flat : FlatLoad<atomic_load>; + +def store_flat : FlatStore <store>; +def truncstorei8_flat : FlatStore <truncstorei8>; +def truncstorei16_flat : FlatStore <truncstorei16>; +def atomic_store_flat : FlatStore <atomic_store>; +def truncstorei8_hi16_flat : StoreHi16<truncstorei8>, FlatStoreAddress; +def truncstorei16_hi16_flat : StoreHi16<truncstorei16>, FlatStoreAddress; + + +def constant_load : ConstantLoad<load>; +def sextloadi8_constant : ConstantLoad <sextloadi8>; +def az_extloadi8_constant : ConstantLoad <az_extloadi8>; +def sextloadi16_constant : ConstantLoad <sextloadi16>; +def az_extloadi16_constant : ConstantLoad <az_extloadi16>; + + class local_binary_atomic_op<SDNode atomic_op> : PatFrag<(ops node:$ptr, node:$value), (atomic_op node:$ptr, node:$value), [{ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; }]>; - def atomic_swap_local : local_binary_atomic_op<atomic_swap>; def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>; def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>; @@ -385,26 +422,14 @@ def mskor_global : PatFrag<(ops node:$val, node:$ptr), return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS; }]>; -multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> { - - def _32_local : PatFrag < - (ops node:$ptr, node:$cmp, node:$swap), - (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ - AtomicSDNode *AN = cast<AtomicSDNode>(N); - return AN->getMemoryVT() == MVT::i32 && - AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; - }]>; - - def _64_local : PatFrag< +class AtomicCmpSwapLocal <SDNode cmp_swap_node> : PatFrag< (ops node:$ptr, node:$cmp, node:$swap), (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{ AtomicSDNode *AN = cast<AtomicSDNode>(N); - return AN->getMemoryVT() == MVT::i64 && - AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; - }]>; -} + return AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; +}]>; -defm atomic_cmp_swap : AtomicCmpSwapLocal <atomic_cmp_swap>; +def atomic_cmp_swap_local : AtomicCmpSwapLocal <atomic_cmp_swap>; multiclass global_binary_atomic_op<SDNode atomic_op> { def "" : PatFrag< @@ -434,26 +459,25 @@ defm atomic_umax_global : 
global_binary_atomic_op<atomic_load_umax>; defm atomic_umin_global : global_binary_atomic_op<atomic_load_umin>; defm atomic_xor_global : global_binary_atomic_op<atomic_load_xor>; -//legacy +// Legacy. def AMDGPUatomic_cmp_swap_global : PatFrag< - (ops node:$ptr, node:$value), - (AMDGPUatomic_cmp_swap node:$ptr, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>; + (ops node:$ptr, node:$value), + (AMDGPUatomic_cmp_swap node:$ptr, node:$value)>, GlobalAddress; def atomic_cmp_swap_global : PatFrag< - (ops node:$ptr, node:$cmp, node:$value), - (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>; + (ops node:$ptr, node:$cmp, node:$value), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$value)>, GlobalAddress; + def atomic_cmp_swap_global_noret : PatFrag< - (ops node:$ptr, node:$cmp, node:$value), - (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; + (ops node:$ptr, node:$cmp, node:$value), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; def atomic_cmp_swap_global_ret : PatFrag< - (ops node:$ptr, node:$cmp, node:$value), - (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), - [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; + (ops node:$ptr, node:$cmp, node:$value), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; //===----------------------------------------------------------------------===// // Misc Pattern Fragments @@ -488,64 +512,11 @@ def FP_HALF : PatLeaf < [{return N->isExactlyValue(0.5);}] >; -let isCodeGenOnly = 1, isPseudo = 1 in { - -let usesCustomInserter = 1 in { - -class CLAMP <RegisterClass rc> : AMDGPUShaderInst < - (outs rc:$dst), - (ins rc:$src0), - "CLAMP $dst, $src0", - [(set f32:$dst, (AMDGPUclamp f32:$src0))] ->; - -class FABS <RegisterClass rc> : AMDGPUShaderInst < - (outs rc:$dst), - (ins rc:$src0), - "FABS $dst, $src0", - [(set f32:$dst, (fabs f32:$src0))] ->; - -class FNEG <RegisterClass rc> : AMDGPUShaderInst < - (outs rc:$dst), - (ins rc:$src0), - "FNEG $dst, $src0", - [(set f32:$dst, (fneg f32:$src0))] ->; - -} // usesCustomInserter = 1 - -multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass, - ComplexPattern addrPat> { -let UseNamedOperandTable = 1 in { - - def RegisterLoad : AMDGPUShaderInst < - (outs dstClass:$dst), - (ins addrClass:$addr, i32imm:$chan), - "RegisterLoad $dst, $addr", - [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))] - > { - let isRegisterLoad = 1; - } - - def RegisterStore : AMDGPUShaderInst < - (outs), - (ins dstClass:$val, addrClass:$addr, i32imm:$chan), - "RegisterStore $val, $addr", - [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))] - > { - let isRegisterStore = 1; - } -} -} - -} // End isCodeGenOnly = 1, isPseudo = 1 - /* Generic helper patterns for intrinsics */ /* -------------------------------------- */ class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul> - : Pat < + : AMDGPUPat < (fpow f32:$src0, f32:$src1), (exp_ieee (mul f32:$src1, (log_ieee f32:$src0))) >; @@ -556,30 +527,34 @@ class POW_Common <AMDGPUInst log_ieee, 
AMDGPUInst exp_ieee, AMDGPUInst mul> /* Extract element pattern */ class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, SubRegIndex sub_reg> - : Pat< + : AMDGPUPat< (sub_type (extractelt vec_type:$src, sub_idx)), (EXTRACT_SUBREG $src, sub_reg) ->; +> { + let SubtargetPredicate = TruePredicate; +} /* Insert element pattern */ class Insert_Element <ValueType elem_type, ValueType vec_type, int sub_idx, SubRegIndex sub_reg> - : Pat < + : AMDGPUPat < (insertelt vec_type:$vec, elem_type:$elem, sub_idx), (INSERT_SUBREG $vec, $elem, sub_reg) ->; +> { + let SubtargetPredicate = TruePredicate; +} // XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer // can handle COPY instructions. // bitconvert pattern -class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat < +class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : AMDGPUPat < (dt (bitconvert (st rc:$src0))), (dt rc:$src0) >; // XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer // can handle COPY instructions. -class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat < +class DwordAddrPat<ValueType vt, RegisterClass rc> : AMDGPUPat < (vt (AMDGPUdwordaddr (vt rc:$addr))), (vt rc:$addr) >; @@ -591,30 +566,30 @@ multiclass BFIPatterns <Instruction BFI_INT, RegisterClass RC64> { // Definition from ISA doc: // (y & x) | (z & ~x) - def : Pat < + def : AMDGPUPat < (or (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), (BFI_INT $x, $y, $z) >; // SHA-256 Ch function // z ^ (x & (y ^ z)) - def : Pat < + def : AMDGPUPat < (xor i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), (BFI_INT $x, $y, $z) >; - def : Pat < + def : AMDGPUPat < (fcopysign f32:$src0, f32:$src1), (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1) >; - def : Pat < + def : AMDGPUPat < (f32 (fcopysign f32:$src0, f64:$src1)), (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, (i32 (EXTRACT_SUBREG $src1, sub1))) >; - def : Pat < + def : AMDGPUPat < (f64 (fcopysign f64:$src0, f64:$src1)), (REG_SEQUENCE RC64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, @@ -623,7 +598,7 @@ multiclass BFIPatterns <Instruction BFI_INT, (i32 (EXTRACT_SUBREG $src1, sub1))), sub1) >; - def : Pat < + def : AMDGPUPat < (f64 (fcopysign f64:$src0, f32:$src1)), (REG_SEQUENCE RC64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, @@ -636,7 +611,7 @@ multiclass BFIPatterns <Instruction BFI_INT, // SHA-256 Ma patterns // ((x & z) | (y & (x | z))) -> BFI_INT (XOR x, y), z, y -class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat < +class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : AMDGPUPat < (or (and i32:$x, i32:$z), (and i32:$y, (or i32:$x, i32:$z))), (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) >; @@ -653,24 +628,24 @@ def IMMPopCount : SDNodeXForm<imm, [{ }]>; multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> { - def : Pat < + def : AMDGPUPat < (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)), (UBFE $src, $rshift, (MOV (i32 (IMMPopCount $mask)))) >; - def : Pat < + def : AMDGPUPat < (srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)), (UBFE $src, (i32 0), $width) >; - def : Pat < + def : AMDGPUPat < (sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)), (SBFE $src, (i32 0), $width) >; } // rotr pattern -class ROTRPattern <Instruction BIT_ALIGN> : Pat < +class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat < (rotr i32:$src0, i32:$src1), (BIT_ALIGN $src0, $src0, $src1) >; @@ -681,7 +656,7 @@ class IntMed3Pat<Instruction med3Inst, 
SDPatternOperator max, SDPatternOperator max_oneuse, SDPatternOperator min_oneuse, - ValueType vt = i32> : Pat< + ValueType vt = i32> : AMDGPUPat< (max (min_oneuse vt:$src0, vt:$src1), (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)), (med3Inst $src0, $src1, $src2) @@ -701,22 +676,24 @@ def cvt_flr_i32_f32 : PatFrag < [{ (void)N; return TM.Options.NoNaNsFPMath; }] >; -class IMad24Pat<Instruction Inst> : Pat < +class IMad24Pat<Instruction Inst, bit HasClamp = 0> : AMDGPUPat < (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2), - (Inst $src0, $src1, $src2) + !if(HasClamp, (Inst $src0, $src1, $src2, (i1 0)), + (Inst $src0, $src1, $src2)) >; -class UMad24Pat<Instruction Inst> : Pat < +class UMad24Pat<Instruction Inst, bit HasClamp = 0> : AMDGPUPat < (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2), - (Inst $src0, $src1, $src2) + !if(HasClamp, (Inst $src0, $src1, $src2, (i1 0)), + (Inst $src0, $src1, $src2)) >; -class RcpPat<Instruction RcpInst, ValueType vt> : Pat < +class RcpPat<Instruction RcpInst, ValueType vt> : AMDGPUPat < (fdiv FP_ONE, vt:$src), (RcpInst $src) >; -class RsqPat<Instruction RsqInst, ValueType vt> : Pat < +class RsqPat<Instruction RsqInst, ValueType vt> : AMDGPUPat < (AMDGPUrcp (fsqrt vt:$src)), (RsqInst $src) >; diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index cc56216c355b..b4704f6feb92 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -13,18 +13,14 @@ //===----------------------------------------------------------------------===// #include "AMDGPULegalizerInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetOpcodes.h" using namespace llvm; -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "You shouldn't build this" -#endif - AMDGPULegalizerInfo::AMDGPULegalizerInfo() { using namespace TargetOpcode; @@ -53,6 +49,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { setAction({G_FCONSTANT, S32}, Legal); + setAction({G_FADD, S32}, Legal); + + setAction({G_FMUL, S32}, Legal); + setAction({G_GEP, P1}, Legal); setAction({G_GEP, P2}, Legal); setAction({G_GEP, 1, S64}, Legal); @@ -66,6 +66,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { setAction({G_LOAD, 1, P1}, Legal); setAction({G_LOAD, 1, P2}, Legal); + setAction({G_OR, S32}, Legal); + setAction({G_SELECT, S32}, Legal); setAction({G_SELECT, 1, S1}, Legal); diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp new file mode 100644 index 000000000000..f594767c8edb --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -0,0 +1,1770 @@ +//===- AMDGPULibCalls.cpp -------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This file does AMD library function optimizations. 
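As a quick aside on the BFIPatterns multiclass in AMDGPUInstructions.td above: the documented bitfield-insert form (y & x) | (z & ~x) and the SHA-256 Ch form z ^ (x & (y ^ z)) can both select to a single BFI_INT because they compute the same value. A small host-side check, not part of the patch, makes that concrete:

// Standalone sketch: verify that the two integer forms matched by
// BFIPatterns agree, which is why both can lower to one BFI_INT.
#include <cstdint>
#include <cstdio>

static uint32_t bfiDoc(uint32_t x, uint32_t y, uint32_t z) {
  return (y & x) | (z & ~x);          // definition from the ISA doc
}

static uint32_t bfiCh(uint32_t x, uint32_t y, uint32_t z) {
  return z ^ (x & (y ^ z));           // SHA-256 Ch formulation
}

int main() {
  uint32_t seed = 0x12345678u;
  for (int i = 0; i < 1000; ++i) {
    // Cheap xorshift-style pseudo-random values for the check.
    seed ^= seed << 13; seed ^= seed >> 17; seed ^= seed << 5;
    uint32_t x = seed, y = seed * 2654435761u, z = ~seed;
    if (bfiDoc(x, y, z) != bfiCh(x, y, z)) {
      std::printf("mismatch\n");
      return 1;
    }
  }
  std::printf("both BFI forms agree\n");
  return 0;
}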
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "amdgpu-simplifylib" + +#include "AMDGPU.h" +#include "AMDGPULibFunc.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" +#include <vector> +#include <cmath> + +using namespace llvm; + +static cl::opt<bool> EnablePreLink("amdgpu-prelink", + cl::desc("Enable pre-link mode optimizations"), + cl::init(false), + cl::Hidden); + +static cl::list<std::string> UseNative("amdgpu-use-native", + cl::desc("Comma separated list of functions to replace with native, or all"), + cl::CommaSeparated, cl::ValueOptional, + cl::Hidden); + +#define MATH_PI 3.14159265358979323846264338327950288419716939937511 +#define MATH_E 2.71828182845904523536028747135266249775724709369996 +#define MATH_SQRT2 1.41421356237309504880168872420969807856967187537695 + +#define MATH_LOG2E 1.4426950408889634073599246810018921374266459541529859 +#define MATH_LOG10E 0.4342944819032518276511289189166050822943970058036665 +// Value of log2(10) +#define MATH_LOG2_10 3.3219280948873623478703194294893901758648313930245806 +// Value of 1 / log2(10) +#define MATH_RLOG2_10 0.3010299956639811952137388947244930267681898814621085 +// Value of 1 / M_LOG2E_F = 1 / log2(e) +#define MATH_RLOG2_E 0.6931471805599453094172321214581765680755001343602552 + +namespace llvm { + +class AMDGPULibCalls { +private: + + typedef llvm::AMDGPULibFunc FuncInfo; + + // -fuse-native. + bool AllNative = false; + + bool useNativeFunc(const StringRef F) const; + + // Return a pointer (pointer expr) to the function if function defintion with + // "FuncName" exists. It may create a new function prototype in pre-link mode. + Constant *getFunction(Module *M, const FuncInfo& fInfo); + + // Replace a normal function with its native version. + bool replaceWithNative(CallInst *CI, const FuncInfo &FInfo); + + bool parseFunctionName(const StringRef& FMangledName, + FuncInfo *FInfo=nullptr /*out*/); + + bool TDOFold(CallInst *CI, const FuncInfo &FInfo); + + /* Specialized optimizations */ + + // recip (half or native) + bool fold_recip(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); + + // divide (half or native) + bool fold_divide(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); + + // pow/powr/pown + bool fold_pow(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); + + // rootn + bool fold_rootn(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); + + // fma/mad + bool fold_fma_mad(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); + + // -fuse-native for sincos + bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo); + + // evaluate calls if calls' arguments are constants. 
+ bool evaluateScalarMathFunc(FuncInfo &FInfo, double& Res0, + double& Res1, Constant *copr0, Constant *copr1, Constant *copr2); + bool evaluateCall(CallInst *aCI, FuncInfo &FInfo); + + // exp + bool fold_exp(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); + + // exp2 + bool fold_exp2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); + + // exp10 + bool fold_exp10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); + + // log + bool fold_log(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); + + // log2 + bool fold_log2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); + + // log10 + bool fold_log10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); + + // sqrt + bool fold_sqrt(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo); + + // sin/cos + bool fold_sincos(CallInst * CI, IRBuilder<> &B, AliasAnalysis * AA); + + // __read_pipe/__write_pipe + bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo); + + // Get insertion point at entry. + BasicBlock::iterator getEntryIns(CallInst * UI); + // Insert an Alloc instruction. + AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix); + // Get a scalar native builtin signle argument FP function + Constant* getNativeFunction(Module* M, const FuncInfo &FInfo); + +protected: + CallInst *CI; + + bool isUnsafeMath(const CallInst *CI) const; + + void replaceCall(Value *With) { + CI->replaceAllUsesWith(With); + CI->eraseFromParent(); + } + +public: + bool fold(CallInst *CI, AliasAnalysis *AA = nullptr); + + void initNativeFuncs(); + + // Replace a normal math function call with that native version + bool useNative(CallInst *CI); +}; + +} // end llvm namespace + +namespace { + + class AMDGPUSimplifyLibCalls : public FunctionPass { + + AMDGPULibCalls Simplifier; + + const TargetOptions Options; + + public: + static char ID; // Pass identification + + AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions()) + : FunctionPass(ID), Options(Opt) { + initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AAResultsWrapperPass>(); + } + + bool runOnFunction(Function &M) override; + }; + + class AMDGPUUseNativeCalls : public FunctionPass { + + AMDGPULibCalls Simplifier; + + public: + static char ID; // Pass identification + + AMDGPUUseNativeCalls() : FunctionPass(ID) { + initializeAMDGPUUseNativeCallsPass(*PassRegistry::getPassRegistry()); + Simplifier.initNativeFuncs(); + } + + bool runOnFunction(Function &F) override; + }; + +} // end anonymous namespace. 
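The AMDGPUUseNativeCalls pass above keys off the -amdgpu-use-native list declared near the top of this file and consumed by initNativeFuncs/useNativeFunc further down: "all", or a bare occurrence of the flag, enables native substitution for every supported function, otherwise only the named functions are rewritten. A simplified standalone model of that matching, with illustrative names only (not the pass code):

// Simplified model of the -amdgpu-use-native matching logic; NativeConfig
// and wantsNative are hypothetical names used only for this sketch.
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

struct NativeConfig {
  std::vector<std::string> Requested; // parsed from -amdgpu-use-native=...
  bool AllNative = false;

  void init() {
    // "all", or a lone empty entry (a bare flag), turns everything on.
    AllNative = std::find(Requested.begin(), Requested.end(), "all") !=
                    Requested.end() ||
                (Requested.size() == 1 && Requested.front().empty());
  }

  bool wantsNative(const std::string &Name) const {
    return AllNative || std::find(Requested.begin(), Requested.end(), Name) !=
                            Requested.end();
  }
};

int main() {
  NativeConfig OnlySinCos;
  OnlySinCos.Requested = {"sin", "cos"};
  OnlySinCos.init();
  assert(OnlySinCos.wantsNative("sin"));
  assert(!OnlySinCos.wantsNative("exp2"));

  NativeConfig Everything;
  Everything.Requested = {""};  // bare -amdgpu-use-native occurrence
  Everything.init();
  assert(Everything.wantsNative("exp2"));
  return 0;
}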
+ +char AMDGPUSimplifyLibCalls::ID = 0; +char AMDGPUUseNativeCalls::ID = 0; + +INITIALIZE_PASS_BEGIN(AMDGPUSimplifyLibCalls, "amdgpu-simplifylib", + "Simplify well-known AMD library calls", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_END(AMDGPUSimplifyLibCalls, "amdgpu-simplifylib", + "Simplify well-known AMD library calls", false, false) + +INITIALIZE_PASS(AMDGPUUseNativeCalls, "amdgpu-usenative", + "Replace builtin math calls with that native versions.", + false, false) + +template <typename IRB> +static CallInst *CreateCallEx(IRB &B, Value *Callee, Value *Arg, + const Twine &Name = "") { + CallInst *R = B.CreateCall(Callee, Arg, Name); + if (Function* F = dyn_cast<Function>(Callee)) + R->setCallingConv(F->getCallingConv()); + return R; +} + +template <typename IRB> +static CallInst *CreateCallEx2(IRB &B, Value *Callee, Value *Arg1, Value *Arg2, + const Twine &Name = "") { + CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name); + if (Function* F = dyn_cast<Function>(Callee)) + R->setCallingConv(F->getCallingConv()); + return R; +} + +// Data structures for table-driven optimizations. +// FuncTbl works for both f32 and f64 functions with 1 input argument + +struct TableEntry { + double result; + double input; +}; + +/* a list of {result, input} */ +static const TableEntry tbl_acos[] = { + {MATH_PI/2.0, 0.0}, + {MATH_PI/2.0, -0.0}, + {0.0, 1.0}, + {MATH_PI, -1.0} +}; +static const TableEntry tbl_acosh[] = { + {0.0, 1.0} +}; +static const TableEntry tbl_acospi[] = { + {0.5, 0.0}, + {0.5, -0.0}, + {0.0, 1.0}, + {1.0, -1.0} +}; +static const TableEntry tbl_asin[] = { + {0.0, 0.0}, + {-0.0, -0.0}, + {MATH_PI/2.0, 1.0}, + {-MATH_PI/2.0, -1.0} +}; +static const TableEntry tbl_asinh[] = { + {0.0, 0.0}, + {-0.0, -0.0} +}; +static const TableEntry tbl_asinpi[] = { + {0.0, 0.0}, + {-0.0, -0.0}, + {0.5, 1.0}, + {-0.5, -1.0} +}; +static const TableEntry tbl_atan[] = { + {0.0, 0.0}, + {-0.0, -0.0}, + {MATH_PI/4.0, 1.0}, + {-MATH_PI/4.0, -1.0} +}; +static const TableEntry tbl_atanh[] = { + {0.0, 0.0}, + {-0.0, -0.0} +}; +static const TableEntry tbl_atanpi[] = { + {0.0, 0.0}, + {-0.0, -0.0}, + {0.25, 1.0}, + {-0.25, -1.0} +}; +static const TableEntry tbl_cbrt[] = { + {0.0, 0.0}, + {-0.0, -0.0}, + {1.0, 1.0}, + {-1.0, -1.0}, +}; +static const TableEntry tbl_cos[] = { + {1.0, 0.0}, + {1.0, -0.0} +}; +static const TableEntry tbl_cosh[] = { + {1.0, 0.0}, + {1.0, -0.0} +}; +static const TableEntry tbl_cospi[] = { + {1.0, 0.0}, + {1.0, -0.0} +}; +static const TableEntry tbl_erfc[] = { + {1.0, 0.0}, + {1.0, -0.0} +}; +static const TableEntry tbl_erf[] = { + {0.0, 0.0}, + {-0.0, -0.0} +}; +static const TableEntry tbl_exp[] = { + {1.0, 0.0}, + {1.0, -0.0}, + {MATH_E, 1.0} +}; +static const TableEntry tbl_exp2[] = { + {1.0, 0.0}, + {1.0, -0.0}, + {2.0, 1.0} +}; +static const TableEntry tbl_exp10[] = { + {1.0, 0.0}, + {1.0, -0.0}, + {10.0, 1.0} +}; +static const TableEntry tbl_expm1[] = { + {0.0, 0.0}, + {-0.0, -0.0} +}; +static const TableEntry tbl_log[] = { + {0.0, 1.0}, + {1.0, MATH_E} +}; +static const TableEntry tbl_log2[] = { + {0.0, 1.0}, + {1.0, 2.0} +}; +static const TableEntry tbl_log10[] = { + {0.0, 1.0}, + {1.0, 10.0} +}; +static const TableEntry tbl_rsqrt[] = { + {1.0, 1.0}, + {1.0/MATH_SQRT2, 2.0} +}; +static const TableEntry tbl_sin[] = { + {0.0, 0.0}, + {-0.0, -0.0} +}; +static const TableEntry tbl_sinh[] = { + {0.0, 0.0}, + {-0.0, -0.0} +}; +static const TableEntry tbl_sinpi[] = { + {0.0, 0.0}, + {-0.0, -0.0} +}; +static const TableEntry tbl_sqrt[] = { + {0.0, 
0.0}, + {1.0, 1.0}, + {MATH_SQRT2, 2.0} +}; +static const TableEntry tbl_tan[] = { + {0.0, 0.0}, + {-0.0, -0.0} +}; +static const TableEntry tbl_tanh[] = { + {0.0, 0.0}, + {-0.0, -0.0} +}; +static const TableEntry tbl_tanpi[] = { + {0.0, 0.0}, + {-0.0, -0.0} +}; +static const TableEntry tbl_tgamma[] = { + {1.0, 1.0}, + {1.0, 2.0}, + {2.0, 3.0}, + {6.0, 4.0} +}; + +static bool HasNative(AMDGPULibFunc::EFuncId id) { + switch(id) { + case AMDGPULibFunc::EI_DIVIDE: + case AMDGPULibFunc::EI_COS: + case AMDGPULibFunc::EI_EXP: + case AMDGPULibFunc::EI_EXP2: + case AMDGPULibFunc::EI_EXP10: + case AMDGPULibFunc::EI_LOG: + case AMDGPULibFunc::EI_LOG2: + case AMDGPULibFunc::EI_LOG10: + case AMDGPULibFunc::EI_POWR: + case AMDGPULibFunc::EI_RECIP: + case AMDGPULibFunc::EI_RSQRT: + case AMDGPULibFunc::EI_SIN: + case AMDGPULibFunc::EI_SINCOS: + case AMDGPULibFunc::EI_SQRT: + case AMDGPULibFunc::EI_TAN: + return true; + default:; + } + return false; +} + +struct TableRef { + size_t size; + const TableEntry *table; // variable size: from 0 to (size - 1) + + TableRef() : size(0), table(nullptr) {} + + template <size_t N> + TableRef(const TableEntry (&tbl)[N]) : size(N), table(&tbl[0]) {} +}; + +static TableRef getOptTable(AMDGPULibFunc::EFuncId id) { + switch(id) { + case AMDGPULibFunc::EI_ACOS: return TableRef(tbl_acos); + case AMDGPULibFunc::EI_ACOSH: return TableRef(tbl_acosh); + case AMDGPULibFunc::EI_ACOSPI: return TableRef(tbl_acospi); + case AMDGPULibFunc::EI_ASIN: return TableRef(tbl_asin); + case AMDGPULibFunc::EI_ASINH: return TableRef(tbl_asinh); + case AMDGPULibFunc::EI_ASINPI: return TableRef(tbl_asinpi); + case AMDGPULibFunc::EI_ATAN: return TableRef(tbl_atan); + case AMDGPULibFunc::EI_ATANH: return TableRef(tbl_atanh); + case AMDGPULibFunc::EI_ATANPI: return TableRef(tbl_atanpi); + case AMDGPULibFunc::EI_CBRT: return TableRef(tbl_cbrt); + case AMDGPULibFunc::EI_NCOS: + case AMDGPULibFunc::EI_COS: return TableRef(tbl_cos); + case AMDGPULibFunc::EI_COSH: return TableRef(tbl_cosh); + case AMDGPULibFunc::EI_COSPI: return TableRef(tbl_cospi); + case AMDGPULibFunc::EI_ERFC: return TableRef(tbl_erfc); + case AMDGPULibFunc::EI_ERF: return TableRef(tbl_erf); + case AMDGPULibFunc::EI_EXP: return TableRef(tbl_exp); + case AMDGPULibFunc::EI_NEXP2: + case AMDGPULibFunc::EI_EXP2: return TableRef(tbl_exp2); + case AMDGPULibFunc::EI_EXP10: return TableRef(tbl_exp10); + case AMDGPULibFunc::EI_EXPM1: return TableRef(tbl_expm1); + case AMDGPULibFunc::EI_LOG: return TableRef(tbl_log); + case AMDGPULibFunc::EI_NLOG2: + case AMDGPULibFunc::EI_LOG2: return TableRef(tbl_log2); + case AMDGPULibFunc::EI_LOG10: return TableRef(tbl_log10); + case AMDGPULibFunc::EI_NRSQRT: + case AMDGPULibFunc::EI_RSQRT: return TableRef(tbl_rsqrt); + case AMDGPULibFunc::EI_NSIN: + case AMDGPULibFunc::EI_SIN: return TableRef(tbl_sin); + case AMDGPULibFunc::EI_SINH: return TableRef(tbl_sinh); + case AMDGPULibFunc::EI_SINPI: return TableRef(tbl_sinpi); + case AMDGPULibFunc::EI_NSQRT: + case AMDGPULibFunc::EI_SQRT: return TableRef(tbl_sqrt); + case AMDGPULibFunc::EI_TAN: return TableRef(tbl_tan); + case AMDGPULibFunc::EI_TANH: return TableRef(tbl_tanh); + case AMDGPULibFunc::EI_TANPI: return TableRef(tbl_tanpi); + case AMDGPULibFunc::EI_TGAMMA: return TableRef(tbl_tgamma); + default:; + } + return TableRef(); +} + +static inline int getVecSize(const AMDGPULibFunc& FInfo) { + return FInfo.getLeads()[0].VectorSize; +} + +static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) { + return 
(AMDGPULibFunc::EType)FInfo.getLeads()[0].ArgType; +} + +Constant *AMDGPULibCalls::getFunction(Module *M, const FuncInfo& fInfo) { + // If we are doing PreLinkOpt, the function is external. So it is safe to + // use getOrInsertFunction() at this stage. + + return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo) + : AMDGPULibFunc::getFunction(M, fInfo); +} + +bool AMDGPULibCalls::parseFunctionName(const StringRef& FMangledName, + FuncInfo *FInfo) { + return AMDGPULibFunc::parse(FMangledName, *FInfo); +} + +bool AMDGPULibCalls::isUnsafeMath(const CallInst *CI) const { + if (auto Op = dyn_cast<FPMathOperator>(CI)) + if (Op->isFast()) + return true; + const Function *F = CI->getParent()->getParent(); + Attribute Attr = F->getFnAttribute("unsafe-fp-math"); + return Attr.getValueAsString() == "true"; +} + +bool AMDGPULibCalls::useNativeFunc(const StringRef F) const { + return AllNative || + std::find(UseNative.begin(), UseNative.end(), F) != UseNative.end(); +} + +void AMDGPULibCalls::initNativeFuncs() { + AllNative = useNativeFunc("all") || + (UseNative.getNumOccurrences() && UseNative.size() == 1 && + UseNative.begin()->empty()); +} + +bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) { + bool native_sin = useNativeFunc("sin"); + bool native_cos = useNativeFunc("cos"); + + if (native_sin && native_cos) { + Module *M = aCI->getModule(); + Value *opr0 = aCI->getArgOperand(0); + + AMDGPULibFunc nf; + nf.getLeads()[0].ArgType = FInfo.getLeads()[0].ArgType; + nf.getLeads()[0].VectorSize = FInfo.getLeads()[0].VectorSize; + + nf.setPrefix(AMDGPULibFunc::NATIVE); + nf.setId(AMDGPULibFunc::EI_SIN); + Constant *sinExpr = getFunction(M, nf); + + nf.setPrefix(AMDGPULibFunc::NATIVE); + nf.setId(AMDGPULibFunc::EI_COS); + Constant *cosExpr = getFunction(M, nf); + if (sinExpr && cosExpr) { + Value *sinval = CallInst::Create(sinExpr, opr0, "splitsin", aCI); + Value *cosval = CallInst::Create(cosExpr, opr0, "splitcos", aCI); + new StoreInst(cosval, aCI->getArgOperand(1), aCI); + + DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI + << " with native version of sin/cos"); + + replaceCall(sinval); + return true; + } + } + return false; +} + +bool AMDGPULibCalls::useNative(CallInst *aCI) { + CI = aCI; + Function *Callee = aCI->getCalledFunction(); + + FuncInfo FInfo; + if (!parseFunctionName(Callee->getName(), &FInfo) || !FInfo.isMangled() || + FInfo.getPrefix() != AMDGPULibFunc::NOPFX || + getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId()) || + !(AllNative || useNativeFunc(FInfo.getName()))) { + return false; + } + + if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS) + return sincosUseNative(aCI, FInfo); + + FInfo.setPrefix(AMDGPULibFunc::NATIVE); + Constant *F = getFunction(aCI->getModule(), FInfo); + if (!F) + return false; + + aCI->setCalledFunction(F); + DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI + << " with native version"); + return true; +} + +// Clang emits call of __read_pipe_2 or __read_pipe_4 for OpenCL read_pipe +// builtin, with appended type size and alignment arguments, where 2 or 4 +// indicates the original number of arguments. The library has optimized version +// of __read_pipe_2/__read_pipe_4 when the type size and alignment has the same +// power of 2 value. This function transforms __read_pipe_2 to __read_pipe_2_N +// for such cases where N is the size in bytes of the type (N = 1, 2, 4, 8, ..., +// 128). The same for __read_pipe_4, write_pipe_2, and write_pipe_4. 
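The comment above reduces to a pure renaming plus a pointer bitcast: when the packet size equals the alignment and both are a power of two, __read_pipe_2 becomes __read_pipe_2_<size>. A small sketch of just the name computation (hypothetical helper, not the pass code):

// Sketch: compute the specialized pipe-builtin name the pass would call,
// or return the original name when no specialization applies.
#include <cstdio>
#include <string>

static bool isPowerOf2(unsigned V) { return V && !(V & (V - 1)); }

static std::string specializePipeBuiltin(const std::string &Name,
                                         unsigned PacketSize,
                                         unsigned PacketAlign) {
  if (PacketSize != PacketAlign || !isPowerOf2(PacketSize))
    return Name;                       // leave __read_pipe_2 untouched
  return Name + "_" + std::to_string(PacketSize);
}

int main() {
  // e.g. an int-sized packet: size 4, align 4 -> __read_pipe_2_4
  std::printf("%s\n", specializePipeBuiltin("__read_pipe_2", 4, 4).c_str());
  // size/align mismatch: no specialization
  std::printf("%s\n", specializePipeBuiltin("__write_pipe_2", 12, 4).c_str());
  return 0;
}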
+bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, + FuncInfo &FInfo) { + auto *Callee = CI->getCalledFunction(); + if (!Callee->isDeclaration()) + return false; + + assert(Callee->hasName() && "Invalid read_pipe/write_pipe function"); + auto *M = Callee->getParent(); + auto &Ctx = M->getContext(); + std::string Name = Callee->getName(); + auto NumArg = CI->getNumArgOperands(); + if (NumArg != 4 && NumArg != 6) + return false; + auto *PacketSize = CI->getArgOperand(NumArg - 2); + auto *PacketAlign = CI->getArgOperand(NumArg - 1); + if (!isa<ConstantInt>(PacketSize) || !isa<ConstantInt>(PacketAlign)) + return false; + unsigned Size = cast<ConstantInt>(PacketSize)->getZExtValue(); + unsigned Align = cast<ConstantInt>(PacketAlign)->getZExtValue(); + if (Size != Align || !isPowerOf2_32(Size)) + return false; + + Type *PtrElemTy; + if (Size <= 8) + PtrElemTy = Type::getIntNTy(Ctx, Size * 8); + else + PtrElemTy = VectorType::get(Type::getInt64Ty(Ctx), Size / 8); + unsigned PtrArgLoc = CI->getNumArgOperands() - 3; + auto PtrArg = CI->getArgOperand(PtrArgLoc); + unsigned PtrArgAS = PtrArg->getType()->getPointerAddressSpace(); + auto *PtrTy = llvm::PointerType::get(PtrElemTy, PtrArgAS); + + SmallVector<llvm::Type *, 6> ArgTys; + for (unsigned I = 0; I != PtrArgLoc; ++I) + ArgTys.push_back(CI->getArgOperand(I)->getType()); + ArgTys.push_back(PtrTy); + + Name = Name + "_" + std::to_string(Size); + auto *FTy = FunctionType::get(Callee->getReturnType(), + ArrayRef<Type *>(ArgTys), false); + AMDGPULibFunc NewLibFunc(Name, FTy); + auto *F = AMDGPULibFunc::getOrInsertFunction(M, NewLibFunc); + if (!F) + return false; + + auto *BCast = B.CreatePointerCast(PtrArg, PtrTy); + SmallVector<Value *, 6> Args; + for (unsigned I = 0; I != PtrArgLoc; ++I) + Args.push_back(CI->getArgOperand(I)); + Args.push_back(BCast); + + auto *NCI = B.CreateCall(F, Args); + NCI->setAttributes(CI->getAttributes()); + CI->replaceAllUsesWith(NCI); + CI->dropAllReferences(); + CI->eraseFromParent(); + + return true; +} + +// This function returns false if no change; return true otherwise. +bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) { + this->CI = CI; + Function *Callee = CI->getCalledFunction(); + + // Ignore indirect calls. + if (Callee == 0) return false; + + FuncInfo FInfo; + if (!parseFunctionName(Callee->getName(), &FInfo)) + return false; + + // Further check the number of arguments to see if they match. + if (CI->getNumArgOperands() != FInfo.getNumArgs()) + return false; + + BasicBlock *BB = CI->getParent(); + LLVMContext &Context = CI->getParent()->getContext(); + IRBuilder<> B(Context); + + // Set the builder to the instruction after the call. + B.SetInsertPoint(BB, CI->getIterator()); + + // Copy fast flags from the original call. + if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(CI)) + B.setFastMathFlags(FPOp->getFastMathFlags()); + + if (TDOFold(CI, FInfo)) + return true; + + // Under unsafe-math, evaluate calls if possible. + // According to Brian Sumner, we can do this for all f32 function calls + // using host's double function calls. + if (isUnsafeMath(CI) && evaluateCall(CI, FInfo)) + return true; + + // Specilized optimizations for each function call + switch (FInfo.getId()) { + case AMDGPULibFunc::EI_RECIP: + // skip vector function + assert ((FInfo.getPrefix() == AMDGPULibFunc::NATIVE || + FInfo.getPrefix() == AMDGPULibFunc::HALF) && + "recip must be an either native or half function"); + return (getVecSize(FInfo) != 1) ? 
false : fold_recip(CI, B, FInfo); + + case AMDGPULibFunc::EI_DIVIDE: + // skip vector function + assert ((FInfo.getPrefix() == AMDGPULibFunc::NATIVE || + FInfo.getPrefix() == AMDGPULibFunc::HALF) && + "divide must be an either native or half function"); + return (getVecSize(FInfo) != 1) ? false : fold_divide(CI, B, FInfo); + + case AMDGPULibFunc::EI_POW: + case AMDGPULibFunc::EI_POWR: + case AMDGPULibFunc::EI_POWN: + return fold_pow(CI, B, FInfo); + + case AMDGPULibFunc::EI_ROOTN: + // skip vector function + return (getVecSize(FInfo) != 1) ? false : fold_rootn(CI, B, FInfo); + + case AMDGPULibFunc::EI_FMA: + case AMDGPULibFunc::EI_MAD: + case AMDGPULibFunc::EI_NFMA: + // skip vector function + return (getVecSize(FInfo) != 1) ? false : fold_fma_mad(CI, B, FInfo); + + case AMDGPULibFunc::EI_SQRT: + return isUnsafeMath(CI) && fold_sqrt(CI, B, FInfo); + case AMDGPULibFunc::EI_COS: + case AMDGPULibFunc::EI_SIN: + if ((getArgType(FInfo) == AMDGPULibFunc::F32 || + getArgType(FInfo) == AMDGPULibFunc::F64) + && (FInfo.getPrefix() == AMDGPULibFunc::NOPFX)) + return fold_sincos(CI, B, AA); + + break; + case AMDGPULibFunc::EI_READ_PIPE_2: + case AMDGPULibFunc::EI_READ_PIPE_4: + case AMDGPULibFunc::EI_WRITE_PIPE_2: + case AMDGPULibFunc::EI_WRITE_PIPE_4: + return fold_read_write_pipe(CI, B, FInfo); + + default: + break; + } + + return false; +} + +bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) { + // Table-Driven optimization + const TableRef tr = getOptTable(FInfo.getId()); + if (tr.size==0) + return false; + + int const sz = (int)tr.size; + const TableEntry * const ftbl = tr.table; + Value *opr0 = CI->getArgOperand(0); + + if (getVecSize(FInfo) > 1) { + if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(opr0)) { + SmallVector<double, 0> DVal; + for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) { + ConstantFP *eltval = dyn_cast<ConstantFP>( + CV->getElementAsConstant((unsigned)eltNo)); + assert(eltval && "Non-FP arguments in math function!"); + bool found = false; + for (int i=0; i < sz; ++i) { + if (eltval->isExactlyValue(ftbl[i].input)) { + DVal.push_back(ftbl[i].result); + found = true; + break; + } + } + if (!found) { + // This vector constants not handled yet. 
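TDOFold, whose scalar path continues just below, is essentially an exact-match table lookup: a constant argument that appears as an input in the per-function table is replaced by the recorded result. A detached model of that lookup using the cos table from above (sketch only, not the pass code):

// Minimal model of the scalar TDOFold path: exact-match lookup of a
// constant argument in a {result, input} table.
#include <cstdio>

struct TableEntry { double result; double input; };

static const TableEntry tbl_cos_model[] = {
  {1.0, 0.0},
  {1.0, -0.0},
};

static bool foldFromTable(const TableEntry *Tbl, int Size, double Arg,
                          double &Res) {
  for (int I = 0; I < Size; ++I) {
    if (Arg == Tbl[I].input) {  // the pass uses ConstantFP::isExactlyValue
      Res = Tbl[I].result;
      return true;
    }
  }
  return false;
}

int main() {
  double R;
  if (foldFromTable(tbl_cos_model, 2, 0.0, R))
    std::printf("cos(0.0) folds to %g\n", R);   // prints 1
  if (!foldFromTable(tbl_cos_model, 2, 0.5, R))
    std::printf("cos(0.5) is left alone\n");
  return 0;
}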
+ return false; + } + } + LLVMContext &context = CI->getParent()->getParent()->getContext(); + Constant *nval; + if (getArgType(FInfo) == AMDGPULibFunc::F32) { + SmallVector<float, 0> FVal; + for (unsigned i = 0; i < DVal.size(); ++i) { + FVal.push_back((float)DVal[i]); + } + ArrayRef<float> tmp(FVal); + nval = ConstantDataVector::get(context, tmp); + } else { // F64 + ArrayRef<double> tmp(DVal); + nval = ConstantDataVector::get(context, tmp); + } + DEBUG(errs() << "AMDIC: " << *CI + << " ---> " << *nval << "\n"); + replaceCall(nval); + return true; + } + } else { + // Scalar version + if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) { + for (int i = 0; i < sz; ++i) { + if (CF->isExactlyValue(ftbl[i].input)) { + Value *nval = ConstantFP::get(CF->getType(), ftbl[i].result); + DEBUG(errs() << "AMDIC: " << *CI + << " ---> " << *nval << "\n"); + replaceCall(nval); + return true; + } + } + } + } + + return false; +} + +bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) { + Module *M = CI->getModule(); + if (getArgType(FInfo) != AMDGPULibFunc::F32 || + FInfo.getPrefix() != AMDGPULibFunc::NOPFX || + !HasNative(FInfo.getId())) + return false; + + AMDGPULibFunc nf = FInfo; + nf.setPrefix(AMDGPULibFunc::NATIVE); + if (Constant *FPExpr = getFunction(M, nf)) { + DEBUG(dbgs() << "AMDIC: " << *CI << " ---> "); + + CI->setCalledFunction(FPExpr); + + DEBUG(dbgs() << *CI << '\n'); + + return true; + } + return false; +} + +// [native_]half_recip(c) ==> 1.0/c +bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B, + const FuncInfo &FInfo) { + Value *opr0 = CI->getArgOperand(0); + if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) { + // Just create a normal div. Later, InstCombine will be able + // to compute the divide into a constant (avoid check float infinity + // or subnormal at this point). 
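Skipping ahead for a moment: fold_pow, defined further down in this file, expands pow(x, c) for integral |c| <= 12 into a chain of multiplies built by repeated squaring, with a final reciprocal for negative c. A standalone sketch of that expansion, checked against the host pow (illustrative only, not the pass code):

// Sketch of the repeated-squaring expansion fold_pow performs for small
// integral exponents: pow(x, c) -> [1/](x * x * ...), |c| <= 12.
#include <cmath>
#include <cstdio>

static double powBySquaring(double X, int C) {
  unsigned Abs = (C < 0) ? -C : C;
  double ValX2 = X;          // running x^(2^k)
  double Prod = 1.0;
  bool First = true;
  while (Abs > 0) {
    if (Abs & 1) {
      Prod = First ? ValX2 : Prod * ValX2;
      First = false;
    }
    ValX2 = ValX2 * ValX2;   // square for the next bit of the exponent
    Abs >>= 1;
  }
  if (First)                 // c == 0
    Prod = 1.0;
  return (C < 0) ? 1.0 / Prod : Prod;
}

int main() {
  for (int C = -12; C <= 12; ++C) {
    double Ref = std::pow(1.7, C);
    double Got = powBySquaring(1.7, C);
    if (std::fabs(Got - Ref) > 1e-9 * std::fabs(Ref)) {
      std::printf("mismatch at c=%d\n", C);
      return 1;
    }
  }
  std::printf("repeated squaring matches pow for |c| <= 12\n");
  return 0;
}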
+ Value *nval = B.CreateFDiv(ConstantFP::get(CF->getType(), 1.0), + opr0, + "recip2div"); + DEBUG(errs() << "AMDIC: " << *CI + << " ---> " << *nval << "\n"); + replaceCall(nval); + return true; + } + return false; +} + +// [native_]half_divide(x, c) ==> x/c +bool AMDGPULibCalls::fold_divide(CallInst *CI, IRBuilder<> &B, + const FuncInfo &FInfo) { + Value *opr0 = CI->getArgOperand(0); + Value *opr1 = CI->getArgOperand(1); + ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0); + ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1); + + if ((CF0 && CF1) || // both are constants + (CF1 && (getArgType(FInfo) == AMDGPULibFunc::F32))) + // CF1 is constant && f32 divide + { + Value *nval1 = B.CreateFDiv(ConstantFP::get(opr1->getType(), 1.0), + opr1, "__div2recip"); + Value *nval = B.CreateFMul(opr0, nval1, "__div2mul"); + replaceCall(nval); + return true; + } + return false; +} + +namespace llvm { +static double log2(double V) { +#if _XOPEN_SOURCE >= 600 || _ISOC99_SOURCE || _POSIX_C_SOURCE >= 200112L + return ::log2(V); +#else + return log(V) / 0.693147180559945309417; +#endif +} +} + +bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B, + const FuncInfo &FInfo) { + assert((FInfo.getId() == AMDGPULibFunc::EI_POW || + FInfo.getId() == AMDGPULibFunc::EI_POWR || + FInfo.getId() == AMDGPULibFunc::EI_POWN) && + "fold_pow: encounter a wrong function call"); + + Value *opr0, *opr1; + ConstantFP *CF; + ConstantInt *CINT; + ConstantAggregateZero *CZero; + Type *eltType; + + opr0 = CI->getArgOperand(0); + opr1 = CI->getArgOperand(1); + CZero = dyn_cast<ConstantAggregateZero>(opr1); + if (getVecSize(FInfo) == 1) { + eltType = opr0->getType(); + CF = dyn_cast<ConstantFP>(opr1); + CINT = dyn_cast<ConstantInt>(opr1); + } else { + VectorType *VTy = dyn_cast<VectorType>(opr0->getType()); + assert(VTy && "Oprand of vector function should be of vectortype"); + eltType = VTy->getElementType(); + ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr1); + + // Now, only Handle vector const whose elements have the same value. + CF = CDV ? dyn_cast_or_null<ConstantFP>(CDV->getSplatValue()) : nullptr; + CINT = CDV ? dyn_cast_or_null<ConstantInt>(CDV->getSplatValue()) : nullptr; + } + + // No unsafe math , no constant argument, do nothing + if (!isUnsafeMath(CI) && !CF && !CINT && !CZero) + return false; + + // 0x1111111 means that we don't do anything for this call. + int ci_opr1 = (CINT ? 
(int)CINT->getSExtValue() : 0x1111111); + + if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0) || CZero) { + // pow/powr/pown(x, 0) == 1 + DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n"); + Constant *cnval = ConstantFP::get(eltType, 1.0); + if (getVecSize(FInfo) > 1) { + cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval); + } + replaceCall(cnval); + return true; + } + if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) { + // pow/powr/pown(x, 1.0) = x + DEBUG(errs() << "AMDIC: " << *CI + << " ---> " << *opr0 << "\n"); + replaceCall(opr0); + return true; + } + if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) { + // pow/powr/pown(x, 2.0) = x*x + DEBUG(errs() << "AMDIC: " << *CI + << " ---> " << *opr0 << " * " << *opr0 << "\n"); + Value *nval = B.CreateFMul(opr0, opr0, "__pow2"); + replaceCall(nval); + return true; + } + if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) { + // pow/powr/pown(x, -1.0) = 1.0/x + DEBUG(errs() << "AMDIC: " << *CI + << " ---> 1 / " << *opr0 << "\n"); + Constant *cnval = ConstantFP::get(eltType, 1.0); + if (getVecSize(FInfo) > 1) { + cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval); + } + Value *nval = B.CreateFDiv(cnval, opr0, "__powrecip"); + replaceCall(nval); + return true; + } + + Module *M = CI->getModule(); + if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) { + // pow[r](x, [-]0.5) = sqrt(x) + bool issqrt = CF->isExactlyValue(0.5); + if (Constant *FPExpr = getFunction(M, + AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT + : AMDGPULibFunc::EI_RSQRT, FInfo))) { + DEBUG(errs() << "AMDIC: " << *CI << " ---> " + << FInfo.getName().c_str() << "(" << *opr0 << ")\n"); + Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt" + : "__pow2rsqrt"); + replaceCall(nval); + return true; + } + } + + if (!isUnsafeMath(CI)) + return false; + + // Unsafe Math optimization + + // Remember that ci_opr1 is set if opr1 is integral + if (CF) { + double dval = (getArgType(FInfo) == AMDGPULibFunc::F32) + ? (double)CF->getValueAPF().convertToFloat() + : CF->getValueAPF().convertToDouble(); + int ival = (int)dval; + if ((double)ival == dval) { + ci_opr1 = ival; + } else + ci_opr1 = 0x11111111; + } + + // pow/powr/pown(x, c) = [1/](x*x*..x); where + // trunc(c) == c && the number of x == c && |c| <= 12 + unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1; + if (abs_opr1 <= 12) { + Constant *cnval; + Value *nval; + if (abs_opr1 == 0) { + cnval = ConstantFP::get(eltType, 1.0); + if (getVecSize(FInfo) > 1) { + cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval); + } + nval = cnval; + } else { + Value *valx2 = nullptr; + nval = nullptr; + while (abs_opr1 > 0) { + valx2 = valx2 ? B.CreateFMul(valx2, valx2, "__powx2") : opr0; + if (abs_opr1 & 1) { + nval = nval ? B.CreateFMul(nval, valx2, "__powprod") : valx2; + } + abs_opr1 >>= 1; + } + } + + if (ci_opr1 < 0) { + cnval = ConstantFP::get(eltType, 1.0); + if (getVecSize(FInfo) > 1) { + cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval); + } + nval = B.CreateFDiv(cnval, nval, "__1powprod"); + } + DEBUG(errs() << "AMDIC: " << *CI << " ---> " + << ((ci_opr1 < 0) ? 
"1/prod(" : "prod(") << *opr0 << ")\n"); + replaceCall(nval); + return true; + } + + // powr ---> exp2(y * log2(x)) + // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31)) + Constant *ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, + FInfo)); + if (!ExpExpr) + return false; + + bool needlog = false; + bool needabs = false; + bool needcopysign = false; + Constant *cnval = nullptr; + if (getVecSize(FInfo) == 1) { + CF = dyn_cast<ConstantFP>(opr0); + + if (CF) { + double V = (getArgType(FInfo) == AMDGPULibFunc::F32) + ? (double)CF->getValueAPF().convertToFloat() + : CF->getValueAPF().convertToDouble(); + + V = log2(std::abs(V)); + cnval = ConstantFP::get(eltType, V); + needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR) && + CF->isNegative(); + } else { + needlog = true; + needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR && + (!CF || CF->isNegative()); + } + } else { + ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr0); + + if (!CDV) { + needlog = true; + needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR; + } else { + assert ((int)CDV->getNumElements() == getVecSize(FInfo) && + "Wrong vector size detected"); + + SmallVector<double, 0> DVal; + for (int i=0; i < getVecSize(FInfo); ++i) { + double V = (getArgType(FInfo) == AMDGPULibFunc::F32) + ? (double)CDV->getElementAsFloat(i) + : CDV->getElementAsDouble(i); + if (V < 0.0) needcopysign = true; + V = log2(std::abs(V)); + DVal.push_back(V); + } + if (getArgType(FInfo) == AMDGPULibFunc::F32) { + SmallVector<float, 0> FVal; + for (unsigned i=0; i < DVal.size(); ++i) { + FVal.push_back((float)DVal[i]); + } + ArrayRef<float> tmp(FVal); + cnval = ConstantDataVector::get(M->getContext(), tmp); + } else { + ArrayRef<double> tmp(DVal); + cnval = ConstantDataVector::get(M->getContext(), tmp); + } + } + } + + if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) { + // We cannot handle corner cases for a general pow() function, give up + // unless y is a constant integral value. Then proceed as if it were pown. + if (getVecSize(FInfo) == 1) { + if (const ConstantFP *CF = dyn_cast<ConstantFP>(opr1)) { + double y = (getArgType(FInfo) == AMDGPULibFunc::F32) + ? (double)CF->getValueAPF().convertToFloat() + : CF->getValueAPF().convertToDouble(); + if (y != (double)(int64_t)y) + return false; + } else + return false; + } else { + if (const ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr1)) { + for (int i=0; i < getVecSize(FInfo); ++i) { + double y = (getArgType(FInfo) == AMDGPULibFunc::F32) + ? (double)CDV->getElementAsFloat(i) + : CDV->getElementAsDouble(i); + if (y != (double)(int64_t)y) + return false; + } + } else + return false; + } + } + + Value *nval; + if (needabs) { + Constant *AbsExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_FABS, + FInfo)); + if (!AbsExpr) + return false; + nval = CreateCallEx(B, AbsExpr, opr0, "__fabs"); + } else { + nval = cnval ? cnval : opr0; + } + if (needlog) { + Constant *LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, + FInfo)); + if (!LogExpr) + return false; + nval = CreateCallEx(B,LogExpr, nval, "__log2"); + } + + if (FInfo.getId() == AMDGPULibFunc::EI_POWN) { + // convert int(32) to fp(f32 or f64) + opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F"); + } + nval = B.CreateFMul(opr1, nval, "__ylogx"); + nval = CreateCallEx(B,ExpExpr, nval, "__exp2"); + + if (needcopysign) { + Value *opr_n; + Type* rTy = opr0->getType(); + Type* nTyS = eltType->isDoubleTy() ? 
B.getInt64Ty() : B.getInt32Ty(); + Type *nTy = nTyS; + if (const VectorType *vTy = dyn_cast<VectorType>(rTy)) + nTy = VectorType::get(nTyS, vTy->getNumElements()); + unsigned size = nTy->getScalarSizeInBits(); + opr_n = CI->getArgOperand(1); + if (opr_n->getType()->isIntegerTy()) + opr_n = B.CreateZExtOrBitCast(opr_n, nTy, "__ytou"); + else + opr_n = B.CreateFPToSI(opr1, nTy, "__ytou"); + + Value *sign = B.CreateShl(opr_n, size-1, "__yeven"); + sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign"); + nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign); + nval = B.CreateBitCast(nval, opr0->getType()); + } + + DEBUG(errs() << "AMDIC: " << *CI << " ---> " + << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n"); + replaceCall(nval); + + return true; +} + +bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B, + const FuncInfo &FInfo) { + Value *opr0 = CI->getArgOperand(0); + Value *opr1 = CI->getArgOperand(1); + + ConstantInt *CINT = dyn_cast<ConstantInt>(opr1); + if (!CINT) { + return false; + } + int ci_opr1 = (int)CINT->getSExtValue(); + if (ci_opr1 == 1) { // rootn(x, 1) = x + DEBUG(errs() << "AMDIC: " << *CI + << " ---> " << *opr0 << "\n"); + replaceCall(opr0); + return true; + } + if (ci_opr1 == 2) { // rootn(x, 2) = sqrt(x) + std::vector<const Type*> ParamsTys; + ParamsTys.push_back(opr0->getType()); + Module *M = CI->getModule(); + if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, + FInfo))) { + DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n"); + Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt"); + replaceCall(nval); + return true; + } + } else if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x) + Module *M = CI->getModule(); + if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, + FInfo))) { + DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n"); + Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt"); + replaceCall(nval); + return true; + } + } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x + DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n"); + Value *nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0), + opr0, + "__rootn2div"); + replaceCall(nval); + return true; + } else if (ci_opr1 == -2) { // rootn(x, -2) = rsqrt(x) + std::vector<const Type*> ParamsTys; + ParamsTys.push_back(opr0->getType()); + Module *M = CI->getModule(); + if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT, + FInfo))) { + DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0 << ")\n"); + Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt"); + replaceCall(nval); + return true; + } + } + return false; +} + +bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B, + const FuncInfo &FInfo) { + Value *opr0 = CI->getArgOperand(0); + Value *opr1 = CI->getArgOperand(1); + Value *opr2 = CI->getArgOperand(2); + + ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0); + ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1); + if ((CF0 && CF0->isZero()) || (CF1 && CF1->isZero())) { + // fma/mad(a, b, c) = c if a=0 || b=0 + DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n"); + replaceCall(opr2); + return true; + } + if (CF0 && CF0->isExactlyValue(1.0f)) { + // fma/mad(a, b, c) = b+c if a=1 + DEBUG(errs() << "AMDIC: " << *CI << " ---> " + << *opr1 << " + " << *opr2 << "\n"); + Value *nval = B.CreateFAdd(opr1, opr2, "fmaadd"); + replaceCall(nval); + return true; + } + if (CF1 && CF1->isExactlyValue(1.0f)) { + // fma/mad(a, b, c) = a+c if b=1 + 
DEBUG(errs() << "AMDIC: " << *CI << " ---> " + << *opr0 << " + " << *opr2 << "\n"); + Value *nval = B.CreateFAdd(opr0, opr2, "fmaadd"); + replaceCall(nval); + return true; + } + if (ConstantFP *CF = dyn_cast<ConstantFP>(opr2)) { + if (CF->isZero()) { + // fma/mad(a, b, c) = a*b if c=0 + DEBUG(errs() << "AMDIC: " << *CI << " ---> " + << *opr0 << " * " << *opr1 << "\n"); + Value *nval = B.CreateFMul(opr0, opr1, "fmamul"); + replaceCall(nval); + return true; + } + } + + return false; +} + +// Get a scalar native builtin signle argument FP function +Constant* AMDGPULibCalls::getNativeFunction(Module* M, const FuncInfo& FInfo) { + if (getArgType(FInfo) == AMDGPULibFunc::F64 || !HasNative(FInfo.getId())) + return nullptr; + FuncInfo nf = FInfo; + nf.setPrefix(AMDGPULibFunc::NATIVE); + return getFunction(M, nf); +} + +// fold sqrt -> native_sqrt (x) +bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B, + const FuncInfo &FInfo) { + if (getArgType(FInfo) == AMDGPULibFunc::F32 && (getVecSize(FInfo) == 1) && + (FInfo.getPrefix() != AMDGPULibFunc::NATIVE)) { + if (Constant *FPExpr = getNativeFunction( + CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) { + Value *opr0 = CI->getArgOperand(0); + DEBUG(errs() << "AMDIC: " << *CI << " ---> " + << "sqrt(" << *opr0 << ")\n"); + Value *nval = CreateCallEx(B,FPExpr, opr0, "__sqrt"); + replaceCall(nval); + return true; + } + } + return false; +} + +// fold sin, cos -> sincos. +bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B, + AliasAnalysis *AA) { + AMDGPULibFunc fInfo; + if (!AMDGPULibFunc::parse(CI->getCalledFunction()->getName(), fInfo)) + return false; + + assert(fInfo.getId() == AMDGPULibFunc::EI_SIN || + fInfo.getId() == AMDGPULibFunc::EI_COS); + bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN; + + Value *CArgVal = CI->getArgOperand(0); + BasicBlock * const CBB = CI->getParent(); + + int const MaxScan = 30; + + { // fold in load value. + LoadInst *LI = dyn_cast<LoadInst>(CArgVal); + if (LI && LI->getParent() == CBB) { + BasicBlock::iterator BBI = LI->getIterator(); + Value *AvailableVal = FindAvailableLoadedValue(LI, CBB, BBI, MaxScan, AA); + if (AvailableVal) { + CArgVal->replaceAllUsesWith(AvailableVal); + if (CArgVal->getNumUses() == 0) + LI->eraseFromParent(); + CArgVal = CI->getArgOperand(0); + } + } + } + + Module *M = CI->getModule(); + fInfo.setId(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN); + std::string const PairName = fInfo.mangle(); + + CallInst *UI = nullptr; + for (User* U : CArgVal->users()) { + CallInst *XI = dyn_cast_or_null<CallInst>(U); + if (!XI || XI == CI || XI->getParent() != CBB) + continue; + + Function *UCallee = XI->getCalledFunction(); + if (!UCallee || !UCallee->getName().equals(PairName)) + continue; + + BasicBlock::iterator BBI = CI->getIterator(); + if (BBI == CI->getParent()->begin()) + break; + --BBI; + for (int I = MaxScan; I > 0 && BBI != CBB->begin(); --BBI, --I) { + if (cast<Instruction>(BBI) == XI) { + UI = XI; + break; + } + } + if (UI) break; + } + + if (!UI) return false; + + // Merge the sin and cos. + + // for OpenCL 2.0 we have only generic implementation of sincos + // function. 
+ AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo); + const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*M); + nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AS.FLAT_ADDRESS); + Function *Fsincos = dyn_cast_or_null<Function>(getFunction(M, nf)); + if (!Fsincos) return false; + + BasicBlock::iterator ItOld = B.GetInsertPoint(); + AllocaInst *Alloc = insertAlloca(UI, B, "__sincos_"); + B.SetInsertPoint(UI); + + Value *P = Alloc; + Type *PTy = Fsincos->getFunctionType()->getParamType(1); + // The allocaInst allocates the memory in private address space. This need + // to be bitcasted to point to the address space of cos pointer type. + // In OpenCL 2.0 this is generic, while in 1.2 that is private. + if (PTy->getPointerAddressSpace() != AS.PRIVATE_ADDRESS) + P = B.CreateAddrSpaceCast(Alloc, PTy); + CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P); + + DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI + << ") with " << *Call << "\n"); + + if (!isSin) { // CI->cos, UI->sin + B.SetInsertPoint(&*ItOld); + UI->replaceAllUsesWith(&*Call); + Instruction *Reload = B.CreateLoad(Alloc); + CI->replaceAllUsesWith(Reload); + UI->eraseFromParent(); + CI->eraseFromParent(); + } else { // CI->sin, UI->cos + Instruction *Reload = B.CreateLoad(Alloc); + UI->replaceAllUsesWith(Reload); + CI->replaceAllUsesWith(Call); + UI->eraseFromParent(); + CI->eraseFromParent(); + } + return true; +} + +// Get insertion point at entry. +BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) { + Function * Func = UI->getParent()->getParent(); + BasicBlock * BB = &Func->getEntryBlock(); + assert(BB && "Entry block not found!"); + BasicBlock::iterator ItNew = BB->begin(); + return ItNew; +} + +// Insert a AllocsInst at the beginning of function entry block. +AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B, + const char *prefix) { + BasicBlock::iterator ItNew = getEntryIns(UI); + Function *UCallee = UI->getCalledFunction(); + Type *RetType = UCallee->getReturnType(); + B.SetInsertPoint(&*ItNew); + AllocaInst *Alloc = B.CreateAlloca(RetType, 0, + std::string(prefix) + UI->getName()); + Alloc->setAlignment(UCallee->getParent()->getDataLayout() + .getTypeAllocSize(RetType)); + return Alloc; +} + +bool AMDGPULibCalls::evaluateScalarMathFunc(FuncInfo &FInfo, + double& Res0, double& Res1, + Constant *copr0, Constant *copr1, + Constant *copr2) { + // By default, opr0/opr1/opr3 holds values of float/double type. + // If they are not float/double, each function has to its + // operand separately. + double opr0=0.0, opr1=0.0, opr2=0.0; + ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(copr0); + ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(copr1); + ConstantFP *fpopr2 = dyn_cast_or_null<ConstantFP>(copr2); + if (fpopr0) { + opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64) + ? fpopr0->getValueAPF().convertToDouble() + : (double)fpopr0->getValueAPF().convertToFloat(); + } + + if (fpopr1) { + opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64) + ? fpopr1->getValueAPF().convertToDouble() + : (double)fpopr1->getValueAPF().convertToFloat(); + } + + if (fpopr2) { + opr2 = (getArgType(FInfo) == AMDGPULibFunc::F64) + ? 
fpopr2->getValueAPF().convertToDouble() + : (double)fpopr2->getValueAPF().convertToFloat(); + } + + switch (FInfo.getId()) { + default : return false; + + case AMDGPULibFunc::EI_ACOS: + Res0 = acos(opr0); + return true; + + case AMDGPULibFunc::EI_ACOSH: + // acosh(x) == log(x + sqrt(x*x - 1)) + Res0 = log(opr0 + sqrt(opr0*opr0 - 1.0)); + return true; + + case AMDGPULibFunc::EI_ACOSPI: + Res0 = acos(opr0) / MATH_PI; + return true; + + case AMDGPULibFunc::EI_ASIN: + Res0 = asin(opr0); + return true; + + case AMDGPULibFunc::EI_ASINH: + // asinh(x) == log(x + sqrt(x*x + 1)) + Res0 = log(opr0 + sqrt(opr0*opr0 + 1.0)); + return true; + + case AMDGPULibFunc::EI_ASINPI: + Res0 = asin(opr0) / MATH_PI; + return true; + + case AMDGPULibFunc::EI_ATAN: + Res0 = atan(opr0); + return true; + + case AMDGPULibFunc::EI_ATANH: + // atanh(x) == (log(x+1) - log(x-1))/2; + Res0 = (log(opr0 + 1.0) - log(opr0 - 1.0))/2.0; + return true; + + case AMDGPULibFunc::EI_ATANPI: + Res0 = atan(opr0) / MATH_PI; + return true; + + case AMDGPULibFunc::EI_CBRT: + Res0 = (opr0 < 0.0) ? -pow(-opr0, 1.0/3.0) : pow(opr0, 1.0/3.0); + return true; + + case AMDGPULibFunc::EI_COS: + Res0 = cos(opr0); + return true; + + case AMDGPULibFunc::EI_COSH: + Res0 = cosh(opr0); + return true; + + case AMDGPULibFunc::EI_COSPI: + Res0 = cos(MATH_PI * opr0); + return true; + + case AMDGPULibFunc::EI_EXP: + Res0 = exp(opr0); + return true; + + case AMDGPULibFunc::EI_EXP2: + Res0 = pow(2.0, opr0); + return true; + + case AMDGPULibFunc::EI_EXP10: + Res0 = pow(10.0, opr0); + return true; + + case AMDGPULibFunc::EI_EXPM1: + Res0 = exp(opr0) - 1.0; + return true; + + case AMDGPULibFunc::EI_LOG: + Res0 = log(opr0); + return true; + + case AMDGPULibFunc::EI_LOG2: + Res0 = log(opr0) / log(2.0); + return true; + + case AMDGPULibFunc::EI_LOG10: + Res0 = log(opr0) / log(10.0); + return true; + + case AMDGPULibFunc::EI_RSQRT: + Res0 = 1.0 / sqrt(opr0); + return true; + + case AMDGPULibFunc::EI_SIN: + Res0 = sin(opr0); + return true; + + case AMDGPULibFunc::EI_SINH: + Res0 = sinh(opr0); + return true; + + case AMDGPULibFunc::EI_SINPI: + Res0 = sin(MATH_PI * opr0); + return true; + + case AMDGPULibFunc::EI_SQRT: + Res0 = sqrt(opr0); + return true; + + case AMDGPULibFunc::EI_TAN: + Res0 = tan(opr0); + return true; + + case AMDGPULibFunc::EI_TANH: + Res0 = tanh(opr0); + return true; + + case AMDGPULibFunc::EI_TANPI: + Res0 = tan(MATH_PI * opr0); + return true; + + case AMDGPULibFunc::EI_RECIP: + Res0 = 1.0 / opr0; + return true; + + // two-arg functions + case AMDGPULibFunc::EI_DIVIDE: + Res0 = opr0 / opr1; + return true; + + case AMDGPULibFunc::EI_POW: + case AMDGPULibFunc::EI_POWR: + Res0 = pow(opr0, opr1); + return true; + + case AMDGPULibFunc::EI_POWN: { + if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) { + double val = (double)iopr1->getSExtValue(); + Res0 = pow(opr0, val); + return true; + } + return false; + } + + case AMDGPULibFunc::EI_ROOTN: { + if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) { + double val = (double)iopr1->getSExtValue(); + Res0 = pow(opr0, 1.0 / val); + return true; + } + return false; + } + + // with ptr arg + case AMDGPULibFunc::EI_SINCOS: + Res0 = sin(opr0); + Res1 = cos(opr0); + return true; + + // three-arg functions + case AMDGPULibFunc::EI_FMA: + case AMDGPULibFunc::EI_MAD: + Res0 = opr0 * opr1 + opr2; + return true; + } + + return false; +} + +bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) { + int numArgs = (int)aCI->getNumArgOperands(); + if (numArgs > 3) + return false; + + 
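  // None of the calls evaluateScalarMathFunc knows how to fold takes more than
  // three operands (fma/mad are the widest), so give up early on anything else.
  // Note that the constant evaluation below uses the host's libm in double
  // precision, so results may differ in the last ulp from the device library;
  // presumably that is considered acceptable for this folding.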
Constant *copr0 = nullptr; + Constant *copr1 = nullptr; + Constant *copr2 = nullptr; + if (numArgs > 0) { + if ((copr0 = dyn_cast<Constant>(aCI->getArgOperand(0))) == nullptr) + return false; + } + + if (numArgs > 1) { + if ((copr1 = dyn_cast<Constant>(aCI->getArgOperand(1))) == nullptr) { + if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS) + return false; + } + } + + if (numArgs > 2) { + if ((copr2 = dyn_cast<Constant>(aCI->getArgOperand(2))) == nullptr) + return false; + } + + // At this point, all arguments to aCI are constants. + + // max vector size is 16, and sincos will generate two results. + double DVal0[16], DVal1[16]; + bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS); + if (getVecSize(FInfo) == 1) { + if (!evaluateScalarMathFunc(FInfo, DVal0[0], + DVal1[0], copr0, copr1, copr2)) { + return false; + } + } else { + ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(copr0); + ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(copr1); + ConstantDataVector *CDV2 = dyn_cast_or_null<ConstantDataVector>(copr2); + for (int i=0; i < getVecSize(FInfo); ++i) { + Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr; + Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr; + Constant *celt2 = CDV2 ? CDV2->getElementAsConstant(i) : nullptr; + if (!evaluateScalarMathFunc(FInfo, DVal0[i], + DVal1[i], celt0, celt1, celt2)) { + return false; + } + } + } + + LLVMContext &context = CI->getParent()->getParent()->getContext(); + Constant *nval0, *nval1; + if (getVecSize(FInfo) == 1) { + nval0 = ConstantFP::get(CI->getType(), DVal0[0]); + if (hasTwoResults) + nval1 = ConstantFP::get(CI->getType(), DVal1[0]); + } else { + if (getArgType(FInfo) == AMDGPULibFunc::F32) { + SmallVector <float, 0> FVal0, FVal1; + for (int i=0; i < getVecSize(FInfo); ++i) + FVal0.push_back((float)DVal0[i]); + ArrayRef<float> tmp0(FVal0); + nval0 = ConstantDataVector::get(context, tmp0); + if (hasTwoResults) { + for (int i=0; i < getVecSize(FInfo); ++i) + FVal1.push_back((float)DVal1[i]); + ArrayRef<float> tmp1(FVal1); + nval1 = ConstantDataVector::get(context, tmp1); + } + } else { + ArrayRef<double> tmp0(DVal0); + nval0 = ConstantDataVector::get(context, tmp0); + if (hasTwoResults) { + ArrayRef<double> tmp1(DVal1); + nval1 = ConstantDataVector::get(context, tmp1); + } + } + } + + if (hasTwoResults) { + // sincos + assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS && + "math function with ptr arg not supported yet"); + new StoreInst(nval1, aCI->getArgOperand(1), aCI); + } + + replaceCall(nval0); + return true; +} + +// Public interface to the Simplify LibCalls pass. 
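// A minimal usage sketch (assumed here, not taken from this file): a target
// would register these factories in its pass pipeline, e.g.
//   PM.add(llvm::createAMDGPUUseNativeCallsPass());
//   PM.add(llvm::createAMDGPUSimplifyLibCallsPass(TM.Options));
// where PM is a legacy::PassManager (or a pass-manager-builder callback) and
// TM.Options supplies the TargetOptions consulted by setFastFlags() below.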
+FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt) { + return new AMDGPUSimplifyLibCalls(Opt); +} + +FunctionPass *llvm::createAMDGPUUseNativeCallsPass() { + return new AMDGPUUseNativeCalls(); +} + +static bool setFastFlags(Function &F, const TargetOptions &Options) { + AttrBuilder B; + + if (Options.UnsafeFPMath || Options.NoInfsFPMath) + B.addAttribute("no-infs-fp-math", "true"); + if (Options.UnsafeFPMath || Options.NoNaNsFPMath) + B.addAttribute("no-nans-fp-math", "true"); + if (Options.UnsafeFPMath) { + B.addAttribute("less-precise-fpmad", "true"); + B.addAttribute("unsafe-fp-math", "true"); + } + + if (!B.hasAttributes()) + return false; + + F.addAttributes(AttributeList::FunctionIndex, B); + + return true; +} + +bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + bool Changed = false; + auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + + DEBUG(dbgs() << "AMDIC: process function "; + F.printAsOperand(dbgs(), false, F.getParent()); + dbgs() << '\n';); + + if (!EnablePreLink) + Changed |= setFastFlags(F, Options); + + for (auto &BB : F) { + for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ) { + // Ignore non-calls. + CallInst *CI = dyn_cast<CallInst>(I); + ++I; + if (!CI) continue; + + // Ignore indirect calls. + Function *Callee = CI->getCalledFunction(); + if (Callee == 0) continue; + + DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n"; + dbgs().flush()); + if(Simplifier.fold(CI, AA)) + Changed = true; + } + } + return Changed; +} + +bool AMDGPUUseNativeCalls::runOnFunction(Function &F) { + if (skipFunction(F) || UseNative.empty()) + return false; + + bool Changed = false; + for (auto &BB : F) { + for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ) { + // Ignore non-calls. + CallInst *CI = dyn_cast<CallInst>(I); + ++I; + if (!CI) continue; + + // Ignore indirect calls. + Function *Callee = CI->getCalledFunction(); + if (Callee == 0) continue; + + if(Simplifier.useNative(CI)) + Changed = true; + } + } + return Changed; +} diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp new file mode 100644 index 000000000000..4671273d61f9 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp @@ -0,0 +1,1054 @@ +//===-- AMDGPULibFunc.cpp -------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains utility functions to work with Itanium mangled names +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPULibFunc.h" +#include <llvm/ADT/SmallString.h> +#include <llvm/ADT/SmallVector.h> +#include <llvm/ADT/StringSwitch.h> +#include "llvm/IR/Attributes.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueSymbolTable.h" +#include <llvm/Support/raw_ostream.h> +#include <string> + +using namespace llvm; + +namespace { + +enum EManglingParam { + E_NONE, + EX_EVENT, + EX_FLOAT4, + EX_INTV4, + EX_RESERVEDID, + EX_SAMPLER, + EX_SIZET, + EX_UINT, + EX_UINTV4, + E_ANY, + E_CONSTPTR_ANY, + E_CONSTPTR_SWAPGL, + E_COPY, + E_IMAGECOORDS, + E_POINTEE, + E_SETBASE_I32, + E_SETBASE_U32, + E_MAKEBASE_UNS, + E_V16_OF_POINTEE, + E_V2_OF_POINTEE, + E_V3_OF_POINTEE, + E_V4_OF_POINTEE, + E_V8_OF_POINTEE, + E_VLTLPTR_ANY, +}; + +struct ManglingRule { + StringRef const Name; + unsigned char Lead[2]; + unsigned char Param[5]; + + int maxLeadIndex() const { return (std::max)(Lead[0], Lead[1]); } + int getNumLeads() const { return (Lead[0] ? 1 : 0) + (Lead[1] ? 1 : 0); } + + unsigned getNumArgs() const; +}; + +// Information about library functions with unmangled names. +class UnmangledFuncInfo { + StringRef const Name; + unsigned NumArgs; + + // Table for all lib functions with unmangled names. + static const UnmangledFuncInfo Table[]; + + // Number of entries in Table. + static const unsigned TableSize; + + // Map function name to index. + class NameMap : public StringMap<unsigned> { + public: + NameMap() { + for (unsigned I = 0; I != TableSize; ++I) + (*this)[Table[I].Name] = I; + } + }; + friend class NameMap; + static NameMap Map; + +public: + using ID = AMDGPULibFunc::EFuncId; + UnmangledFuncInfo() = default; + UnmangledFuncInfo(StringRef _Name, unsigned _NumArgs) + : Name(_Name), NumArgs(_NumArgs) {} + // Get index to Table by function name. + static bool lookup(StringRef Name, ID &Id); + static unsigned toIndex(ID Id) { + assert(static_cast<unsigned>(Id) > + static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED) && + "Invalid unmangled library function"); + return static_cast<unsigned>(Id) - 1 - + static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED); + } + static ID toFuncId(unsigned Index) { + assert(Index < TableSize && "Invalid unmangled library function"); + return static_cast<ID>( + Index + 1 + static_cast<unsigned>(AMDGPULibFunc::EI_LAST_MANGLED)); + } + static unsigned getNumArgs(ID Id) { return Table[toIndex(Id)].NumArgs; } + static StringRef getName(ID Id) { return Table[toIndex(Id)].Name; } +}; + +unsigned ManglingRule::getNumArgs() const { + unsigned I=0; + while (I < (sizeof Param/sizeof Param[0]) && Param[I]) ++I; + return I; +} + +// This table describes function formal argument type rules. The order of rules +// corresponds to the EFuncId enum at AMDGPULibFunc.h +// +// "<func name>", { <leads> }, { <param rules> } +// where: +// <leads> - list of integers that are one-based indexes of formal argument +// used to mangle a function name. Other argument types are derived from types +// of these 'leads'. The order of integers in this list correspond to the +// order in which these arguments are mangled in the EDG mangling scheme. The +// same order should be preserved for arguments in the AMDGPULibFunc structure +// when it is used for mangling. 
For example: +// { "vstorea_half", {3,1}, {E_ANY,EX_SIZET,E_ANY}}, +// will be mangled in EDG scheme as vstorea_half_<3dparam>_<1stparam> +// When mangling from code use: +// AMDGPULibFunc insc; +// insc.param[0] = ... // describe 3rd parameter +// insc.param[1] = ... // describe 1rd parameter +// +// <param rules> - list of rules used to derive all of the function formal +// argument types. EX_ prefixed are simple types, other derived from the +// latest 'lead' argument type in the order of encoding from first to last. +// E_ANY - use prev lead type, E_CONSTPTR_ANY - make const pointer out of +// prev lead type, etc. see ParamIterator::getNextParam() for details. + +static const ManglingRule manglingRules[] = { +{ StringRef(), {0}, {0} }, +{ "abs" , {1}, {E_ANY}}, +{ "abs_diff" , {1}, {E_ANY,E_COPY}}, +{ "acos" , {1}, {E_ANY}}, +{ "acosh" , {1}, {E_ANY}}, +{ "acospi" , {1}, {E_ANY}}, +{ "add_sat" , {1}, {E_ANY,E_COPY}}, +{ "all" , {1}, {E_ANY}}, +{ "any" , {1}, {E_ANY}}, +{ "asin" , {1}, {E_ANY}}, +{ "asinh" , {1}, {E_ANY}}, +{ "asinpi" , {1}, {E_ANY}}, +{ "async_work_group_copy" , {1}, {E_ANY,E_CONSTPTR_SWAPGL,EX_SIZET,EX_EVENT}}, +{ "async_work_group_strided_copy" , {1}, {E_ANY,E_CONSTPTR_SWAPGL,EX_SIZET,EX_SIZET,EX_EVENT}}, +{ "atan" , {1}, {E_ANY}}, +{ "atan2" , {1}, {E_ANY,E_COPY}}, +{ "atan2pi" , {1}, {E_ANY,E_COPY}}, +{ "atanh" , {1}, {E_ANY}}, +{ "atanpi" , {1}, {E_ANY}}, +{ "atomic_add" , {1}, {E_VLTLPTR_ANY,E_POINTEE}}, +{ "atomic_and" , {1}, {E_VLTLPTR_ANY,E_POINTEE}}, +{ "atomic_cmpxchg" , {1}, {E_VLTLPTR_ANY,E_POINTEE,E_POINTEE}}, +{ "atomic_dec" , {1}, {E_VLTLPTR_ANY}}, +{ "atomic_inc" , {1}, {E_VLTLPTR_ANY}}, +{ "atomic_max" , {1}, {E_VLTLPTR_ANY,E_POINTEE}}, +{ "atomic_min" , {1}, {E_VLTLPTR_ANY,E_POINTEE}}, +{ "atomic_or" , {1}, {E_VLTLPTR_ANY,E_POINTEE}}, +{ "atomic_sub" , {1}, {E_VLTLPTR_ANY,E_POINTEE}}, +{ "atomic_xchg" , {1}, {E_VLTLPTR_ANY,E_POINTEE}}, +{ "atomic_xor" , {1}, {E_VLTLPTR_ANY,E_POINTEE}}, +{ "bitselect" , {1}, {E_ANY,E_COPY,E_COPY}}, +{ "cbrt" , {1}, {E_ANY}}, +{ "ceil" , {1}, {E_ANY}}, +{ "clamp" , {1}, {E_ANY,E_COPY,E_COPY}}, +{ "clz" , {1}, {E_ANY}}, +{ "commit_read_pipe" , {1}, {E_ANY,EX_RESERVEDID}}, +{ "commit_write_pipe" , {1}, {E_ANY,EX_RESERVEDID}}, +{ "copysign" , {1}, {E_ANY,E_COPY}}, +{ "cos" , {1}, {E_ANY}}, +{ "cosh" , {1}, {E_ANY}}, +{ "cospi" , {1}, {E_ANY}}, +{ "cross" , {1}, {E_ANY,E_COPY}}, +{ "ctz" , {1}, {E_ANY}}, +{ "degrees" , {1}, {E_ANY}}, +{ "distance" , {1}, {E_ANY,E_COPY}}, +{ "divide" , {1}, {E_ANY,E_COPY}}, +{ "dot" , {1}, {E_ANY,E_COPY}}, +{ "erf" , {1}, {E_ANY}}, +{ "erfc" , {1}, {E_ANY}}, +{ "exp" , {1}, {E_ANY}}, +{ "exp10" , {1}, {E_ANY}}, +{ "exp2" , {1}, {E_ANY}}, +{ "expm1" , {1}, {E_ANY}}, +{ "fabs" , {1}, {E_ANY}}, +{ "fast_distance" , {1}, {E_ANY,E_COPY}}, +{ "fast_length" , {1}, {E_ANY}}, +{ "fast_normalize" , {1}, {E_ANY}}, +{ "fdim" , {1}, {E_ANY,E_COPY}}, +{ "floor" , {1}, {E_ANY}}, +{ "fma" , {1}, {E_ANY,E_COPY,E_COPY}}, +{ "fmax" , {1}, {E_ANY,E_COPY}}, +{ "fmin" , {1}, {E_ANY,E_COPY}}, +{ "fmod" , {1}, {E_ANY,E_COPY}}, +{ "fract" , {2}, {E_POINTEE,E_ANY}}, +{ "frexp" , {1,2}, {E_ANY,E_ANY}}, +{ "get_image_array_size" , {1}, {E_ANY}}, +{ "get_image_channel_data_type" , {1}, {E_ANY}}, +{ "get_image_channel_order" , {1}, {E_ANY}}, +{ "get_image_dim" , {1}, {E_ANY}}, +{ "get_image_height" , {1}, {E_ANY}}, +{ "get_image_width" , {1}, {E_ANY}}, +{ "get_pipe_max_packets" , {1}, {E_ANY}}, +{ "get_pipe_num_packets" , {1}, {E_ANY}}, +{ "hadd" , {1}, {E_ANY,E_COPY}}, +{ "hypot" , {1}, {E_ANY,E_COPY}}, +{ "ilogb" , 
{1}, {E_ANY}}, +{ "isequal" , {1}, {E_ANY,E_COPY}}, +{ "isfinite" , {1}, {E_ANY}}, +{ "isgreater" , {1}, {E_ANY,E_COPY}}, +{ "isgreaterequal" , {1}, {E_ANY,E_COPY}}, +{ "isinf" , {1}, {E_ANY}}, +{ "isless" , {1}, {E_ANY,E_COPY}}, +{ "islessequal" , {1}, {E_ANY,E_COPY}}, +{ "islessgreater" , {1}, {E_ANY,E_COPY}}, +{ "isnan" , {1}, {E_ANY}}, +{ "isnormal" , {1}, {E_ANY}}, +{ "isnotequal" , {1}, {E_ANY,E_COPY}}, +{ "isordered" , {1}, {E_ANY,E_COPY}}, +{ "isunordered" , {1}, {E_ANY,E_COPY}}, +{ "ldexp" , {1}, {E_ANY,E_SETBASE_I32}}, +{ "length" , {1}, {E_ANY}}, +{ "lgamma" , {1}, {E_ANY}}, +{ "lgamma_r" , {1,2}, {E_ANY,E_ANY}}, +{ "log" , {1}, {E_ANY}}, +{ "log10" , {1}, {E_ANY}}, +{ "log1p" , {1}, {E_ANY}}, +{ "log2" , {1}, {E_ANY}}, +{ "logb" , {1}, {E_ANY}}, +{ "mad" , {1}, {E_ANY,E_COPY,E_COPY}}, +{ "mad24" , {1}, {E_ANY,E_COPY,E_COPY}}, +{ "mad_hi" , {1}, {E_ANY,E_COPY,E_COPY}}, +{ "mad_sat" , {1}, {E_ANY,E_COPY,E_COPY}}, +{ "max" , {1}, {E_ANY,E_COPY}}, +{ "maxmag" , {1}, {E_ANY,E_COPY}}, +{ "min" , {1}, {E_ANY,E_COPY}}, +{ "minmag" , {1}, {E_ANY,E_COPY}}, +{ "mix" , {1}, {E_ANY,E_COPY,E_COPY}}, +{ "modf" , {2}, {E_POINTEE,E_ANY}}, +{ "mul24" , {1}, {E_ANY,E_COPY}}, +{ "mul_hi" , {1}, {E_ANY,E_COPY}}, +{ "nan" , {1}, {E_ANY}}, +{ "nextafter" , {1}, {E_ANY,E_COPY}}, +{ "normalize" , {1}, {E_ANY}}, +{ "popcount" , {1}, {E_ANY}}, +{ "pow" , {1}, {E_ANY,E_COPY}}, +{ "pown" , {1}, {E_ANY,E_SETBASE_I32}}, +{ "powr" , {1}, {E_ANY,E_COPY}}, +{ "prefetch" , {1}, {E_CONSTPTR_ANY,EX_SIZET}}, +{ "radians" , {1}, {E_ANY}}, +{ "recip" , {1}, {E_ANY}}, +{ "remainder" , {1}, {E_ANY,E_COPY}}, +{ "remquo" , {1,3}, {E_ANY,E_COPY,E_ANY}}, +{ "reserve_read_pipe" , {1}, {E_ANY,EX_UINT}}, +{ "reserve_write_pipe" , {1}, {E_ANY,EX_UINT}}, +{ "rhadd" , {1}, {E_ANY,E_COPY}}, +{ "rint" , {1}, {E_ANY}}, +{ "rootn" , {1}, {E_ANY,E_SETBASE_I32}}, +{ "rotate" , {1}, {E_ANY,E_COPY}}, +{ "round" , {1}, {E_ANY}}, +{ "rsqrt" , {1}, {E_ANY}}, +{ "select" , {1,3}, {E_ANY,E_COPY,E_ANY}}, +{ "shuffle" , {1,2}, {E_ANY,E_ANY}}, +{ "shuffle2" , {1,3}, {E_ANY,E_COPY,E_ANY}}, +{ "sign" , {1}, {E_ANY}}, +{ "signbit" , {1}, {E_ANY}}, +{ "sin" , {1}, {E_ANY}}, +{ "sincos" , {2}, {E_POINTEE,E_ANY}}, +{ "sinh" , {1}, {E_ANY}}, +{ "sinpi" , {1}, {E_ANY}}, +{ "smoothstep" , {1}, {E_ANY,E_COPY,E_COPY}}, +{ "sqrt" , {1}, {E_ANY}}, +{ "step" , {1}, {E_ANY,E_COPY}}, +{ "sub_group_broadcast" , {1}, {E_ANY,EX_UINT}}, +{ "sub_group_commit_read_pipe" , {1}, {E_ANY,EX_RESERVEDID}}, +{ "sub_group_commit_write_pipe" , {1}, {E_ANY,EX_RESERVEDID}}, +{ "sub_group_reduce_add" , {1}, {E_ANY}}, +{ "sub_group_reduce_max" , {1}, {E_ANY}}, +{ "sub_group_reduce_min" , {1}, {E_ANY}}, +{ "sub_group_reserve_read_pipe" , {1}, {E_ANY,EX_UINT}}, +{ "sub_group_reserve_write_pipe" , {1}, {E_ANY,EX_UINT}}, +{ "sub_group_scan_exclusive_add" , {1}, {E_ANY}}, +{ "sub_group_scan_exclusive_max" , {1}, {E_ANY}}, +{ "sub_group_scan_exclusive_min" , {1}, {E_ANY}}, +{ "sub_group_scan_inclusive_add" , {1}, {E_ANY}}, +{ "sub_group_scan_inclusive_max" , {1}, {E_ANY}}, +{ "sub_group_scan_inclusive_min" , {1}, {E_ANY}}, +{ "sub_sat" , {1}, {E_ANY,E_COPY}}, +{ "tan" , {1}, {E_ANY}}, +{ "tanh" , {1}, {E_ANY}}, +{ "tanpi" , {1}, {E_ANY}}, +{ "tgamma" , {1}, {E_ANY}}, +{ "trunc" , {1}, {E_ANY}}, +{ "upsample" , {1}, {E_ANY,E_MAKEBASE_UNS}}, +{ "vec_step" , {1}, {E_ANY}}, +{ "vstore" , {3}, {E_POINTEE,EX_SIZET,E_ANY}}, +{ "vstore16" , {3}, {E_V16_OF_POINTEE,EX_SIZET,E_ANY}}, +{ "vstore2" , {3}, {E_V2_OF_POINTEE,EX_SIZET,E_ANY}}, +{ "vstore3" , {3}, {E_V3_OF_POINTEE,EX_SIZET,E_ANY}}, +{ 
"vstore4" , {3}, {E_V4_OF_POINTEE,EX_SIZET,E_ANY}}, +{ "vstore8" , {3}, {E_V8_OF_POINTEE,EX_SIZET,E_ANY}}, +{ "work_group_commit_read_pipe" , {1}, {E_ANY,EX_RESERVEDID}}, +{ "work_group_commit_write_pipe" , {1}, {E_ANY,EX_RESERVEDID}}, +{ "work_group_reduce_add" , {1}, {E_ANY}}, +{ "work_group_reduce_max" , {1}, {E_ANY}}, +{ "work_group_reduce_min" , {1}, {E_ANY}}, +{ "work_group_reserve_read_pipe" , {1}, {E_ANY,EX_UINT}}, +{ "work_group_reserve_write_pipe" , {1}, {E_ANY,EX_UINT}}, +{ "work_group_scan_exclusive_add" , {1}, {E_ANY}}, +{ "work_group_scan_exclusive_max" , {1}, {E_ANY}}, +{ "work_group_scan_exclusive_min" , {1}, {E_ANY}}, +{ "work_group_scan_inclusive_add" , {1}, {E_ANY}}, +{ "work_group_scan_inclusive_max" , {1}, {E_ANY}}, +{ "work_group_scan_inclusive_min" , {1}, {E_ANY}}, +{ "write_imagef" , {1}, {E_ANY,E_IMAGECOORDS,EX_FLOAT4}}, +{ "write_imagei" , {1}, {E_ANY,E_IMAGECOORDS,EX_INTV4}}, +{ "write_imageui" , {1}, {E_ANY,E_IMAGECOORDS,EX_UINTV4}}, +{ "ncos" , {1}, {E_ANY} }, +{ "nexp2" , {1}, {E_ANY} }, +{ "nfma" , {1}, {E_ANY, E_COPY, E_COPY} }, +{ "nlog2" , {1}, {E_ANY} }, +{ "nrcp" , {1}, {E_ANY} }, +{ "nrsqrt" , {1}, {E_ANY} }, +{ "nsin" , {1}, {E_ANY} }, +{ "nsqrt" , {1}, {E_ANY} }, +{ "ftz" , {1}, {E_ANY} }, +{ "fldexp" , {1}, {E_ANY, EX_UINT} }, +{ "class" , {1}, {E_ANY, EX_UINT} }, +{ "rcbrt" , {1}, {E_ANY} }, +}; + +// Library functions with unmangled name. +const UnmangledFuncInfo UnmangledFuncInfo::Table[] = { + {"__read_pipe_2", 4}, + {"__read_pipe_4", 6}, + {"__write_pipe_2", 4}, + {"__write_pipe_4", 6}, +}; + +const unsigned UnmangledFuncInfo::TableSize = + sizeof(UnmangledFuncInfo::Table) / sizeof(UnmangledFuncInfo::Table[0]); + +UnmangledFuncInfo::NameMap UnmangledFuncInfo::Map; + +static const struct ManglingRulesMap : public StringMap<int> { + ManglingRulesMap() + : StringMap<int>(sizeof(manglingRules)/sizeof(manglingRules[0])) { + int Id = 0; + for (auto Rule : manglingRules) + insert({ Rule.Name, Id++ }); + } +} manglingRulesMap; + +static AMDGPULibFunc::Param getRetType(AMDGPULibFunc::EFuncId id, + const AMDGPULibFunc::Param (&Leads)[2]) { + AMDGPULibFunc::Param Res = Leads[0]; + // TBD - This switch may require to be extended for other intriniscs + switch (id) { + case AMDGPULibFunc::EI_SINCOS: + Res.PtrKind = AMDGPULibFunc::BYVALUE; + break; + default: + break; + } + return Res; +} + +class ParamIterator { + const AMDGPULibFunc::Param (&Leads)[2]; + const ManglingRule& Rule; + int Index; +public: + ParamIterator(const AMDGPULibFunc::Param (&leads)[2], + const ManglingRule& rule) + : Leads(leads), Rule(rule), Index(0) {} + + AMDGPULibFunc::Param getNextParam(); +}; + +AMDGPULibFunc::Param ParamIterator::getNextParam() { + AMDGPULibFunc::Param P; + if (Index >= int(sizeof Rule.Param/sizeof Rule.Param[0])) return P; + + const char R = Rule.Param[Index]; + switch (R) { + case E_NONE: break; + case EX_UINT: + P.ArgType = AMDGPULibFunc::U32; break; + case EX_INTV4: + P.ArgType = AMDGPULibFunc::I32; P.VectorSize = 4; break; + case EX_UINTV4: + P.ArgType = AMDGPULibFunc::U32; P.VectorSize = 4; break; + case EX_FLOAT4: + P.ArgType = AMDGPULibFunc::F32; P.VectorSize = 4; break; + case EX_SIZET: + P.ArgType = AMDGPULibFunc::U64; break; + case EX_EVENT: + P.ArgType = AMDGPULibFunc::EVENT; break; + case EX_SAMPLER: + P.ArgType = AMDGPULibFunc::SAMPLER; break; + case EX_RESERVEDID: break; // TBD + default: + if (Index == (Rule.Lead[1] - 1)) P = Leads[1]; + else P = Leads[0]; + + switch (R) { + case E_ANY: + case E_COPY: break; + + case E_POINTEE: + P.PtrKind = 
AMDGPULibFunc::BYVALUE; break; + case E_V2_OF_POINTEE: + P.VectorSize = 2; P.PtrKind = AMDGPULibFunc::BYVALUE; break; + case E_V3_OF_POINTEE: + P.VectorSize = 3; P.PtrKind = AMDGPULibFunc::BYVALUE; break; + case E_V4_OF_POINTEE: + P.VectorSize = 4; P.PtrKind = AMDGPULibFunc::BYVALUE; break; + case E_V8_OF_POINTEE: + P.VectorSize = 8; P.PtrKind = AMDGPULibFunc::BYVALUE; break; + case E_V16_OF_POINTEE: + P.VectorSize = 16; P.PtrKind = AMDGPULibFunc::BYVALUE; break; + case E_CONSTPTR_ANY: + P.PtrKind |= AMDGPULibFunc::CONST; break; + case E_VLTLPTR_ANY: + P.PtrKind |= AMDGPULibFunc::VOLATILE; break; + case E_SETBASE_I32: + P.ArgType = AMDGPULibFunc::I32; break; + case E_SETBASE_U32: + P.ArgType = AMDGPULibFunc::U32; break; + + case E_MAKEBASE_UNS: + P.ArgType &= ~AMDGPULibFunc::BASE_TYPE_MASK; + P.ArgType |= AMDGPULibFunc::UINT; + break; + + case E_IMAGECOORDS: + switch (P.ArgType) { + case AMDGPULibFunc::IMG1DA: P.VectorSize = 2; break; + case AMDGPULibFunc::IMG1DB: P.VectorSize = 1; break; + case AMDGPULibFunc::IMG2DA: P.VectorSize = 4; break; + case AMDGPULibFunc::IMG1D: P.VectorSize = 1; break; + case AMDGPULibFunc::IMG2D: P.VectorSize = 2; break; + case AMDGPULibFunc::IMG3D: P.VectorSize = 4; break; + } + P.PtrKind = AMDGPULibFunc::BYVALUE; + P.ArgType = AMDGPULibFunc::I32; + break; + + case E_CONSTPTR_SWAPGL: { + unsigned AS = AMDGPULibFunc::getAddrSpaceFromEPtrKind(P.PtrKind); + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: AS = AMDGPUAS::LOCAL_ADDRESS; break; + case AMDGPUAS::LOCAL_ADDRESS: AS = AMDGPUAS::GLOBAL_ADDRESS; break; + } + P.PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AS); + P.PtrKind |= AMDGPULibFunc::CONST; + break; + } + + default: llvm_unreachable("Unhandeled param rule"); + } + } + ++Index; + return P; +} + +inline static void drop_front(StringRef& str, size_t n = 1) { + str = str.drop_front(n); +} + +static bool eatTerm(StringRef& mangledName, const char c) { + if (mangledName.front() == c) { + drop_front(mangledName); + return true; + } + return false; +} + +template <size_t N> +static bool eatTerm(StringRef& mangledName, const char (&str)[N]) { + if (mangledName.startswith(StringRef(str, N-1))) { + drop_front(mangledName, N-1); + return true; + } + return false; +} + +static inline bool isDigit(char c) { return c >= '0' && c <= '9'; } + +static int eatNumber(StringRef& s) { + size_t const savedSize = s.size(); + int n = 0; + while (!s.empty() && isDigit(s.front())) { + n = n*10 + s.front() - '0'; + drop_front(s); + } + return s.size() < savedSize ? 
n : -1; +} + +static StringRef eatLengthPrefixedName(StringRef& mangledName) { + int const Len = eatNumber(mangledName); + if (Len <= 0 || static_cast<size_t>(Len) > mangledName.size()) + return StringRef(); + StringRef Res = mangledName.substr(0, Len); + drop_front(mangledName, Len); + return Res; +} + +} // end anonymous namespace + +AMDGPUMangledLibFunc::AMDGPUMangledLibFunc() { + FuncId = EI_NONE; + FKind = NOPFX; + Leads[0].reset(); + Leads[1].reset(); + Name.clear(); +} + +AMDGPUUnmangledLibFunc::AMDGPUUnmangledLibFunc() { + FuncId = EI_NONE; + FuncTy = nullptr; +} + +AMDGPUMangledLibFunc::AMDGPUMangledLibFunc( + EFuncId id, const AMDGPUMangledLibFunc ©From) { + FuncId = id; + FKind = copyFrom.FKind; + Leads[0] = copyFrom.Leads[0]; + Leads[1] = copyFrom.Leads[1]; +} + +/////////////////////////////////////////////////////////////////////////////// +// Demangling + +static int parseVecSize(StringRef& mangledName) { + size_t const Len = eatNumber(mangledName); + switch (Len) { + case 2: case 3: case 4: case 8: case 16: + return Len; + default: + break; + } + return 1; +} + +static AMDGPULibFunc::ENamePrefix parseNamePrefix(StringRef& mangledName) { + std::pair<StringRef, StringRef> const P = mangledName.split('_'); + AMDGPULibFunc::ENamePrefix Pfx = + StringSwitch<AMDGPULibFunc::ENamePrefix>(P.first) + .Case("native", AMDGPULibFunc::NATIVE) + .Case("half" , AMDGPULibFunc::HALF) + .Default(AMDGPULibFunc::NOPFX); + + if (Pfx != AMDGPULibFunc::NOPFX) + mangledName = P.second; + + return Pfx; +} + +bool AMDGPUMangledLibFunc::parseUnmangledName(StringRef FullName) { + FuncId = static_cast<EFuncId>(manglingRulesMap.lookup(FullName)); + return FuncId != EI_NONE; +} + +/////////////////////////////////////////////////////////////////////////////// +// Itanium Demangling + +namespace { +struct ItaniumParamParser { + AMDGPULibFunc::Param Prev; + bool parseItaniumParam(StringRef& param, AMDGPULibFunc::Param &res); +}; +} // namespace + +bool ItaniumParamParser::parseItaniumParam(StringRef& param, + AMDGPULibFunc::Param &res) { + res.reset(); + if (param.empty()) return false; + + // parse pointer prefix + if (eatTerm(param, 'P')) { + if (eatTerm(param, 'K')) res.PtrKind |= AMDGPULibFunc::CONST; + if (eatTerm(param, 'V')) res.PtrKind |= AMDGPULibFunc::VOLATILE; + unsigned AS; + if (!eatTerm(param, "U3AS")) { + AS = 0; + } else { + AS = param.front() - '0'; + drop_front(param, 1); + } + res.PtrKind |= AMDGPULibFuncBase::getEPtrKindFromAddrSpace(AS); + } else { + res.PtrKind = AMDGPULibFunc::BYVALUE; + } + + // parse vector size + if (eatTerm(param,"Dv")) { + res.VectorSize = parseVecSize(param); + if (res.VectorSize==1 || !eatTerm(param, '_')) return false; + } + + // parse type + char const TC = param.front(); + if (::isDigit(TC)) { + res.ArgType = StringSwitch<AMDGPULibFunc::EType> + (eatLengthPrefixedName(param)) + .Case("ocl_image1darray" , AMDGPULibFunc::IMG1DA) + .Case("ocl_image1dbuffer", AMDGPULibFunc::IMG1DB) + .Case("ocl_image2darray" , AMDGPULibFunc::IMG2DA) + .Case("ocl_image1d" , AMDGPULibFunc::IMG1D) + .Case("ocl_image2d" , AMDGPULibFunc::IMG2D) + .Case("ocl_image3d" , AMDGPULibFunc::IMG3D) + .Case("ocl_event" , AMDGPULibFunc::DUMMY) + .Case("ocl_sampler" , AMDGPULibFunc::DUMMY) + .Default(AMDGPULibFunc::DUMMY); + } else { + drop_front(param); + switch (TC) { + case 'h': res.ArgType = AMDGPULibFunc::U8; break; + case 't': res.ArgType = AMDGPULibFunc::U16; break; + case 'j': res.ArgType = AMDGPULibFunc::U32; break; + case 'm': res.ArgType = AMDGPULibFunc::U64; break; + case 'c': 
res.ArgType = AMDGPULibFunc::I8; break; + case 's': res.ArgType = AMDGPULibFunc::I16; break; + case 'i': res.ArgType = AMDGPULibFunc::I32; break; + case 'l': res.ArgType = AMDGPULibFunc::I64; break; + case 'f': res.ArgType = AMDGPULibFunc::F32; break; + case 'd': res.ArgType = AMDGPULibFunc::F64; break; + case 'D': if (!eatTerm(param, 'h')) return false; + res.ArgType = AMDGPULibFunc::F16; break; + case 'S': + if (!eatTerm(param, '_')) { + eatNumber(param); + if (!eatTerm(param, '_')) return false; + } + res.VectorSize = Prev.VectorSize; + res.ArgType = Prev.ArgType; + break; + default:; + } + } + if (res.ArgType == 0) return false; + Prev.VectorSize = res.VectorSize; + Prev.ArgType = res.ArgType; + return true; +} + +bool AMDGPUMangledLibFunc::parseFuncName(StringRef &mangledName) { + StringRef Name = eatLengthPrefixedName(mangledName); + FKind = parseNamePrefix(Name); + if (!parseUnmangledName(Name)) + return false; + + const ManglingRule& Rule = manglingRules[FuncId]; + ItaniumParamParser Parser; + for (int I=0; I < Rule.maxLeadIndex(); ++I) { + Param P; + if (!Parser.parseItaniumParam(mangledName, P)) + return false; + + if ((I + 1) == Rule.Lead[0]) Leads[0] = P; + if ((I + 1) == Rule.Lead[1]) Leads[1] = P; + } + return true; +} + +bool AMDGPUUnmangledLibFunc::parseFuncName(StringRef &Name) { + if (!UnmangledFuncInfo::lookup(Name, FuncId)) + return false; + setName(Name); + return true; +} + +bool AMDGPULibFunc::parse(StringRef FuncName, AMDGPULibFunc &F) { + if (FuncName.empty()) { + F.Impl = std::unique_ptr<AMDGPULibFuncImpl>(); + return false; + } + + if (eatTerm(FuncName, "_Z")) + F.Impl = make_unique<AMDGPUMangledLibFunc>(); + else + F.Impl = make_unique<AMDGPUUnmangledLibFunc>(); + if (F.Impl->parseFuncName(FuncName)) + return true; + + F.Impl = std::unique_ptr<AMDGPULibFuncImpl>(); + return false; +} + +StringRef AMDGPUMangledLibFunc::getUnmangledName(StringRef mangledName) { + StringRef S = mangledName; + if (eatTerm(S, "_Z")) + return eatLengthPrefixedName(S); + return StringRef(); +} + +/////////////////////////////////////////////////////////////////////////////// +// Mangling + +template <typename Stream> +void AMDGPUMangledLibFunc::writeName(Stream &OS) const { + const char *Pfx = ""; + switch (FKind) { + case NATIVE: Pfx = "native_"; break; + case HALF: Pfx = "half_"; break; + default: break; + } + if (!Name.empty()) { + OS << Pfx << Name; + } else if (FuncId != EI_NONE) { + OS << Pfx; + const StringRef& S = manglingRules[FuncId].Name; + OS.write(S.data(), S.size()); + } +} + +std::string AMDGPUMangledLibFunc::mangle() const { return mangleNameItanium(); } + +/////////////////////////////////////////////////////////////////////////////// +// Itanium Mangling + +static const char *getItaniumTypeName(AMDGPULibFunc::EType T) { + switch (T) { + case AMDGPULibFunc::U8: return "h"; + case AMDGPULibFunc::U16: return "t"; + case AMDGPULibFunc::U32: return "j"; + case AMDGPULibFunc::U64: return "m"; + case AMDGPULibFunc::I8: return "c"; + case AMDGPULibFunc::I16: return "s"; + case AMDGPULibFunc::I32: return "i"; + case AMDGPULibFunc::I64: return "l"; + case AMDGPULibFunc::F16: return "Dh"; + case AMDGPULibFunc::F32: return "f"; + case AMDGPULibFunc::F64: return "d"; + case AMDGPULibFunc::IMG1DA: return "16ocl_image1darray"; + case AMDGPULibFunc::IMG1DB: return "17ocl_image1dbuffer"; + case AMDGPULibFunc::IMG2DA: return "16ocl_image2darray"; + case AMDGPULibFunc::IMG1D: return "11ocl_image1d"; + case AMDGPULibFunc::IMG2D: return "11ocl_image2d"; + case AMDGPULibFunc::IMG3D: return 
"11ocl_image3d"; + case AMDGPULibFunc::SAMPLER: return "11ocl_sampler"; + case AMDGPULibFunc::EVENT: return "9ocl_event"; + default: llvm_unreachable("Unhandeled param type"); + } + return nullptr; +} + +namespace { +// Itanium mangling ABI says: +// "5.1.8. Compression +// ... Each non-terminal in the grammar for which <substitution> appears on the +// right-hand side is both a source of future substitutions and a candidate +// for being substituted. There are two exceptions that appear to be +// substitution candidates from the grammar, but are explicitly excluded: +// 1. <builtin-type> other than vendor extended types ..." + +// For the purpose of functions the following productions make sence for the +// substitution: +// <type> ::= <builtin-type> +// ::= <class-enum-type> +// ::= <array-type> +// ::=<CV-qualifiers> <type> +// ::= P <type> # pointer-to +// ::= <substitution> +// +// Note that while types like images, samplers and events are by the ABI encoded +// using <class-enum-type> production rule they're not used for substitution +// because clang consider them as builtin types. +// +// DvNN_ type is GCC extension for vectors and is a subject for the substitution. + + +class ItaniumMangler { + SmallVector<AMDGPULibFunc::Param, 10> Str; // list of accumulated substituions + bool UseAddrSpace; + + int findSubst(const AMDGPULibFunc::Param& P) const { + for(unsigned I = 0; I < Str.size(); ++I) { + const AMDGPULibFunc::Param& T = Str[I]; + if (P.PtrKind == T.PtrKind && + P.VectorSize == T.VectorSize && + P.ArgType == T.ArgType) { + return I; + } + } + return -1; + } + + template <typename Stream> + bool trySubst(Stream& os, const AMDGPULibFunc::Param& p) { + int const subst = findSubst(p); + if (subst < 0) return false; + // Substitutions are mangled as S(XX)?_ where XX is a hexadecimal number + // 0 1 2 + // S_ S0_ S1_ + if (subst == 0) os << "S_"; + else os << 'S' << (subst-1) << '_'; + return true; + } + +public: + ItaniumMangler(bool useAddrSpace) + : UseAddrSpace(useAddrSpace) {} + + template <typename Stream> + void operator()(Stream& os, AMDGPULibFunc::Param p) { + + // Itanium mangling ABI 5.1.8. Compression: + // Logically, the substitutable components of a mangled name are considered + // left-to-right, components before the composite structure of which they + // are a part. If a component has been encountered before, it is substituted + // as described below. This decision is independent of whether its components + // have been substituted, so an implementation may optimize by considering + // large structures for substitution before their components. If a component + // has not been encountered before, its mangling is identified, and it is + // added to a dictionary of substitution candidates. No entity is added to + // the dictionary twice. + AMDGPULibFunc::Param Ptr; + + if (p.PtrKind) { + if (trySubst(os, p)) return; + os << 'P'; + if (p.PtrKind & AMDGPULibFunc::CONST) os << 'K'; + if (p.PtrKind & AMDGPULibFunc::VOLATILE) os << 'V'; + unsigned AS = UseAddrSpace + ? 
AMDGPULibFuncBase::getAddrSpaceFromEPtrKind(p.PtrKind) + : 0; + if (AS != 0) os << "U3AS" << AS; + Ptr = p; + p.PtrKind = 0; + } + + if (p.VectorSize > 1) { + if (trySubst(os, p)) goto exit; + Str.push_back(p); + os << "Dv" << static_cast<unsigned>(p.VectorSize) << '_'; + } + + os << getItaniumTypeName((AMDGPULibFunc::EType)p.ArgType); + + exit: + if (Ptr.ArgType) Str.push_back(Ptr); + } +}; +} // namespace + +std::string AMDGPUMangledLibFunc::mangleNameItanium() const { + SmallString<128> Buf; + raw_svector_ostream S(Buf); + SmallString<128> NameBuf; + raw_svector_ostream Name(NameBuf); + writeName(Name); + const StringRef& NameStr = Name.str(); + S << "_Z" << static_cast<int>(NameStr.size()) << NameStr; + + ItaniumMangler Mangler(true); + ParamIterator I(Leads, manglingRules[FuncId]); + Param P; + while ((P = I.getNextParam()).ArgType != 0) + Mangler(S, P); + return S.str(); +} + +/////////////////////////////////////////////////////////////////////////////// +// Misc + +static Type* getIntrinsicParamType( + LLVMContext& C, + const AMDGPULibFunc::Param& P, + bool useAddrSpace) { + Type* T = nullptr; + switch (P.ArgType) { + case AMDGPULibFunc::U8: + case AMDGPULibFunc::I8: T = Type::getInt8Ty(C); break; + case AMDGPULibFunc::U16: + case AMDGPULibFunc::I16: T = Type::getInt16Ty(C); break; + case AMDGPULibFunc::U32: + case AMDGPULibFunc::I32: T = Type::getInt32Ty(C); break; + case AMDGPULibFunc::U64: + case AMDGPULibFunc::I64: T = Type::getInt64Ty(C); break; + case AMDGPULibFunc::F16: T = Type::getHalfTy(C); break; + case AMDGPULibFunc::F32: T = Type::getFloatTy(C); break; + case AMDGPULibFunc::F64: T = Type::getDoubleTy(C); break; + + case AMDGPULibFunc::IMG1DA: + case AMDGPULibFunc::IMG1DB: + case AMDGPULibFunc::IMG2DA: + case AMDGPULibFunc::IMG1D: + case AMDGPULibFunc::IMG2D: + case AMDGPULibFunc::IMG3D: + T = StructType::create(C,"ocl_image")->getPointerTo(); break; + case AMDGPULibFunc::SAMPLER: + T = StructType::create(C,"ocl_sampler")->getPointerTo(); break; + case AMDGPULibFunc::EVENT: + T = StructType::create(C,"ocl_event")->getPointerTo(); break; + default: + llvm_unreachable("Unhandeled param type"); + return nullptr; + } + if (P.VectorSize > 1) + T = VectorType::get(T, P.VectorSize); + if (P.PtrKind != AMDGPULibFunc::BYVALUE) + T = useAddrSpace ? 
T->getPointerTo((P.PtrKind & AMDGPULibFunc::ADDR_SPACE) + - 1) + : T->getPointerTo(); + return T; +} + +FunctionType *AMDGPUMangledLibFunc::getFunctionType(Module &M) const { + LLVMContext& C = M.getContext(); + std::vector<Type*> Args; + ParamIterator I(Leads, manglingRules[FuncId]); + Param P; + while ((P=I.getNextParam()).ArgType != 0) + Args.push_back(getIntrinsicParamType(C, P, true)); + + return FunctionType::get( + getIntrinsicParamType(C, getRetType(FuncId, Leads), true), + Args, false); +} + +unsigned AMDGPUMangledLibFunc::getNumArgs() const { + return manglingRules[FuncId].getNumArgs(); +} + +unsigned AMDGPUUnmangledLibFunc::getNumArgs() const { + return UnmangledFuncInfo::getNumArgs(FuncId); +} + +std::string AMDGPUMangledLibFunc::getName() const { + SmallString<128> Buf; + raw_svector_ostream OS(Buf); + writeName(OS); + return OS.str(); +} + +Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc &fInfo) { + std::string FuncName = fInfo.mangle(); + Function *F = dyn_cast_or_null<Function>( + M->getValueSymbolTable().lookup(FuncName)); + + // check formal with actual types conformance + if (F && !F->isDeclaration() + && !F->isVarArg() + && F->arg_size() == fInfo.getNumArgs()) { + return F; + } + return nullptr; +} + +Function *AMDGPULibFunc::getOrInsertFunction(Module *M, + const AMDGPULibFunc &fInfo) { + std::string const FuncName = fInfo.mangle(); + Function *F = dyn_cast_or_null<Function>( + M->getValueSymbolTable().lookup(FuncName)); + + // check formal with actual types conformance + if (F && !F->isDeclaration() + && !F->isVarArg() + && F->arg_size() == fInfo.getNumArgs()) { + return F; + } + + FunctionType *FuncTy = fInfo.getFunctionType(*M); + + bool hasPtr = false; + for (FunctionType::param_iterator + PI = FuncTy->param_begin(), + PE = FuncTy->param_end(); + PI != PE; ++PI) { + const Type* argTy = static_cast<const Type*>(*PI); + if (argTy->isPointerTy()) { + hasPtr = true; + break; + } + } + + Constant *C = nullptr; + if (hasPtr) { + // Do not set extra attributes for functions with pointer arguments. 
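    // (Presumably because ReadOnly would be incorrect for functions such as
    // sincos, fract or modf, which write a result through their pointer
    // operand; the ReadOnly/NoUnwind attributes added in the else branch are
    // therefore applied only to pure value functions.)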
+ C = M->getOrInsertFunction(FuncName, FuncTy); + } else { + AttributeList Attr; + LLVMContext &Ctx = M->getContext(); + Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::ReadOnly); + Attr.addAttribute(Ctx, AttributeList::FunctionIndex, Attribute::NoUnwind); + C = M->getOrInsertFunction(FuncName, FuncTy, Attr); + } + + return cast<Function>(C); +} + +bool UnmangledFuncInfo::lookup(StringRef Name, ID &Id) { + auto Loc = Map.find(Name); + if (Loc != Map.end()) { + Id = toFuncId(Loc->second); + return true; + } + Id = AMDGPULibFunc::EI_NONE; + return false; +} + +AMDGPULibFunc::AMDGPULibFunc(const AMDGPULibFunc &F) { + if (auto *MF = dyn_cast<AMDGPUMangledLibFunc>(F.Impl.get())) + Impl.reset(new AMDGPUMangledLibFunc(*MF)); + else if (auto *UMF = dyn_cast<AMDGPUUnmangledLibFunc>(F.Impl.get())) + Impl.reset(new AMDGPUUnmangledLibFunc(*UMF)); + else + Impl = std::unique_ptr<AMDGPULibFuncImpl>(); +} + +AMDGPULibFunc &AMDGPULibFunc::operator=(const AMDGPULibFunc &F) { + if (this == &F) + return *this; + new (this) AMDGPULibFunc(F); + return *this; +} + +AMDGPULibFunc::AMDGPULibFunc(EFuncId Id, const AMDGPULibFunc &CopyFrom) { + assert(AMDGPULibFuncBase::isMangled(Id) && CopyFrom.isMangled() && + "not supported"); + Impl.reset(new AMDGPUMangledLibFunc( + Id, *cast<AMDGPUMangledLibFunc>(CopyFrom.Impl.get()))); +} + +AMDGPULibFunc::AMDGPULibFunc(StringRef Name, FunctionType *FT) { + Impl.reset(new AMDGPUUnmangledLibFunc(Name, FT)); +} + +void AMDGPULibFunc::initMangled() { Impl.reset(new AMDGPUMangledLibFunc()); } + +AMDGPULibFunc::Param *AMDGPULibFunc::getLeads() { + if (!Impl) + initMangled(); + return cast<AMDGPUMangledLibFunc>(Impl.get())->Leads; +} + +const AMDGPULibFunc::Param *AMDGPULibFunc::getLeads() const { + return cast<const AMDGPUMangledLibFunc>(Impl.get())->Leads; +} diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.h b/lib/Target/AMDGPU/AMDGPULibFunc.h new file mode 100644 index 000000000000..5405bc645714 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPULibFunc.h @@ -0,0 +1,459 @@ +//===-- AMDGPULibFunc.h ---------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef _AMDGPU_LIBFUNC_H_ +#define _AMDGPU_LIBFUNC_H_ + +#include "llvm/ADT/StringRef.h" + +namespace llvm { + +class FunctionType; +class Function; +class Module; + +class AMDGPULibFuncBase { +public: + enum EFuncId { + EI_NONE, + + // IMPORTANT: enums below should go in ascending by 1 value order + // because they are used as indexes in the mangling rules table. + // don't use explicit value assignment. + // + // There are two types of library functions: those with mangled + // name and those with unmangled name. The enums for the library + // functions with mangled name are defined before enums for the + // library functions with unmangled name. The enum for the last + // library function with mangled name is EI_LAST_MANGLED. + // + // Library functions with mangled name. 
+ EI_ABS, + EI_ABS_DIFF, + EI_ACOS, + EI_ACOSH, + EI_ACOSPI, + EI_ADD_SAT, + EI_ALL, + EI_ANY, + EI_ASIN, + EI_ASINH, + EI_ASINPI, + EI_ASYNC_WORK_GROUP_COPY, + EI_ASYNC_WORK_GROUP_STRIDED_COPY, + EI_ATAN, + EI_ATAN2, + EI_ATAN2PI, + EI_ATANH, + EI_ATANPI, + EI_ATOMIC_ADD, + EI_ATOMIC_AND, + EI_ATOMIC_CMPXCHG, + EI_ATOMIC_DEC, + EI_ATOMIC_INC, + EI_ATOMIC_MAX, + EI_ATOMIC_MIN, + EI_ATOMIC_OR, + EI_ATOMIC_SUB, + EI_ATOMIC_XCHG, + EI_ATOMIC_XOR, + EI_BITSELECT, + EI_CBRT, + EI_CEIL, + EI_CLAMP, + EI_CLZ, + EI_COMMIT_READ_PIPE, + EI_COMMIT_WRITE_PIPE, + EI_COPYSIGN, + EI_COS, + EI_COSH, + EI_COSPI, + EI_CROSS, + EI_CTZ, + EI_DEGREES, + EI_DISTANCE, + EI_DIVIDE, + EI_DOT, + EI_ERF, + EI_ERFC, + EI_EXP, + EI_EXP10, + EI_EXP2, + EI_EXPM1, + EI_FABS, + EI_FAST_DISTANCE, + EI_FAST_LENGTH, + EI_FAST_NORMALIZE, + EI_FDIM, + EI_FLOOR, + EI_FMA, + EI_FMAX, + EI_FMIN, + EI_FMOD, + EI_FRACT, + EI_FREXP, + EI_GET_IMAGE_ARRAY_SIZE, + EI_GET_IMAGE_CHANNEL_DATA_TYPE, + EI_GET_IMAGE_CHANNEL_ORDER, + EI_GET_IMAGE_DIM, + EI_GET_IMAGE_HEIGHT, + EI_GET_IMAGE_WIDTH, + EI_GET_PIPE_MAX_PACKETS, + EI_GET_PIPE_NUM_PACKETS, + EI_HADD, + EI_HYPOT, + EI_ILOGB, + EI_ISEQUAL, + EI_ISFINITE, + EI_ISGREATER, + EI_ISGREATEREQUAL, + EI_ISINF, + EI_ISLESS, + EI_ISLESSEQUAL, + EI_ISLESSGREATER, + EI_ISNAN, + EI_ISNORMAL, + EI_ISNOTEQUAL, + EI_ISORDERED, + EI_ISUNORDERED, + EI_LDEXP, + EI_LENGTH, + EI_LGAMMA, + EI_LGAMMA_R, + EI_LOG, + EI_LOG10, + EI_LOG1P, + EI_LOG2, + EI_LOGB, + EI_MAD, + EI_MAD24, + EI_MAD_HI, + EI_MAD_SAT, + EI_MAX, + EI_MAXMAG, + EI_MIN, + EI_MINMAG, + EI_MIX, + EI_MODF, + EI_MUL24, + EI_MUL_HI, + EI_NAN, + EI_NEXTAFTER, + EI_NORMALIZE, + EI_POPCOUNT, + EI_POW, + EI_POWN, + EI_POWR, + EI_PREFETCH, + EI_RADIANS, + EI_RECIP, + EI_REMAINDER, + EI_REMQUO, + EI_RESERVE_READ_PIPE, + EI_RESERVE_WRITE_PIPE, + EI_RHADD, + EI_RINT, + EI_ROOTN, + EI_ROTATE, + EI_ROUND, + EI_RSQRT, + EI_SELECT, + EI_SHUFFLE, + EI_SHUFFLE2, + EI_SIGN, + EI_SIGNBIT, + EI_SIN, + EI_SINCOS, + EI_SINH, + EI_SINPI, + EI_SMOOTHSTEP, + EI_SQRT, + EI_STEP, + EI_SUB_GROUP_BROADCAST, + EI_SUB_GROUP_COMMIT_READ_PIPE, + EI_SUB_GROUP_COMMIT_WRITE_PIPE, + EI_SUB_GROUP_REDUCE_ADD, + EI_SUB_GROUP_REDUCE_MAX, + EI_SUB_GROUP_REDUCE_MIN, + EI_SUB_GROUP_RESERVE_READ_PIPE, + EI_SUB_GROUP_RESERVE_WRITE_PIPE, + EI_SUB_GROUP_SCAN_EXCLUSIVE_ADD, + EI_SUB_GROUP_SCAN_EXCLUSIVE_MAX, + EI_SUB_GROUP_SCAN_EXCLUSIVE_MIN, + EI_SUB_GROUP_SCAN_INCLUSIVE_ADD, + EI_SUB_GROUP_SCAN_INCLUSIVE_MAX, + EI_SUB_GROUP_SCAN_INCLUSIVE_MIN, + EI_SUB_SAT, + EI_TAN, + EI_TANH, + EI_TANPI, + EI_TGAMMA, + EI_TRUNC, + EI_UPSAMPLE, + EI_VEC_STEP, + EI_VSTORE, + EI_VSTORE16, + EI_VSTORE2, + EI_VSTORE3, + EI_VSTORE4, + EI_VSTORE8, + EI_WORK_GROUP_COMMIT_READ_PIPE, + EI_WORK_GROUP_COMMIT_WRITE_PIPE, + EI_WORK_GROUP_REDUCE_ADD, + EI_WORK_GROUP_REDUCE_MAX, + EI_WORK_GROUP_REDUCE_MIN, + EI_WORK_GROUP_RESERVE_READ_PIPE, + EI_WORK_GROUP_RESERVE_WRITE_PIPE, + EI_WORK_GROUP_SCAN_EXCLUSIVE_ADD, + EI_WORK_GROUP_SCAN_EXCLUSIVE_MAX, + EI_WORK_GROUP_SCAN_EXCLUSIVE_MIN, + EI_WORK_GROUP_SCAN_INCLUSIVE_ADD, + EI_WORK_GROUP_SCAN_INCLUSIVE_MAX, + EI_WORK_GROUP_SCAN_INCLUSIVE_MIN, + EI_WRITE_IMAGEF, + EI_WRITE_IMAGEI, + EI_WRITE_IMAGEUI, + EI_NCOS, + EI_NEXP2, + EI_NFMA, + EI_NLOG2, + EI_NRCP, + EI_NRSQRT, + EI_NSIN, + EI_NSQRT, + EI_FTZ, + EI_FLDEXP, + EI_CLASS, + EI_RCBRT, + EI_LAST_MANGLED = + EI_RCBRT, /* The last library function with mangled name */ + + // Library functions with unmangled name. 
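    // (These are matched by their literal names, e.g. "__read_pipe_2", via
    // UnmangledFuncInfo rather than by demangling.)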
+ EI_READ_PIPE_2, + EI_READ_PIPE_4, + EI_WRITE_PIPE_2, + EI_WRITE_PIPE_4, + + EX_INTRINSICS_COUNT + }; + + enum ENamePrefix { + NOPFX, + NATIVE, + HALF + }; + + enum EType { + B8 = 1, + B16 = 2, + B32 = 3, + B64 = 4, + SIZE_MASK = 7, + FLOAT = 0x10, + INT = 0x20, + UINT = 0x30, + BASE_TYPE_MASK = 0x30, + U8 = UINT | B8, + U16 = UINT | B16, + U32 = UINT | B32, + U64 = UINT | B64, + I8 = INT | B8, + I16 = INT | B16, + I32 = INT | B32, + I64 = INT | B64, + F16 = FLOAT | B16, + F32 = FLOAT | B32, + F64 = FLOAT | B64, + IMG1DA = 0x80, + IMG1DB, + IMG2DA, + IMG1D, + IMG2D, + IMG3D, + SAMPLER, + EVENT, + DUMMY + }; + + enum EPtrKind { + BYVALUE = 0, + ADDR_SPACE = 0xF, // Address space takes value 0x1 ~ 0xF. + CONST = 0x10, + VOLATILE = 0x20 + }; + + struct Param { + unsigned char ArgType; + unsigned char VectorSize; + unsigned char PtrKind; + + unsigned char Reserved; + + void reset() { + ArgType = 0; + VectorSize = 1; + PtrKind = 0; + } + Param() { reset(); } + + template <typename Stream> + void mangleItanium(Stream& os); + }; + static bool isMangled(EFuncId Id) { + return static_cast<unsigned>(Id) <= static_cast<unsigned>(EI_LAST_MANGLED); + } + + static unsigned getEPtrKindFromAddrSpace(unsigned AS) { + assert(((AS + 1) & ~ADDR_SPACE) == 0); + return AS + 1; + } + + static unsigned getAddrSpaceFromEPtrKind(unsigned Kind) { + Kind = Kind & ADDR_SPACE; + assert(Kind >= 1); + return Kind - 1; + } +}; + +class AMDGPULibFuncImpl : public AMDGPULibFuncBase { +public: + AMDGPULibFuncImpl() {} + virtual ~AMDGPULibFuncImpl() {} + + /// Get unmangled name for mangled library function and name for unmangled + /// library function. + virtual std::string getName() const = 0; + virtual unsigned getNumArgs() const = 0; + EFuncId getId() const { return FuncId; } + ENamePrefix getPrefix() const { return FKind; } + + bool isMangled() const { return AMDGPULibFuncBase::isMangled(FuncId); } + + void setId(EFuncId id) { FuncId = id; } + virtual bool parseFuncName(StringRef &mangledName) = 0; + + /// \return The mangled function name for mangled library functions + /// and unmangled function name for unmangled library functions. + virtual std::string mangle() const = 0; + + void setName(StringRef N) { Name = N; } + void setPrefix(ENamePrefix pfx) { FKind = pfx; } + + virtual FunctionType *getFunctionType(Module &M) const = 0; + +protected: + EFuncId FuncId; + std::string Name; + ENamePrefix FKind; +}; + +/// Wrapper class for AMDGPULIbFuncImpl +class AMDGPULibFunc : public AMDGPULibFuncBase { +public: + explicit AMDGPULibFunc() : Impl(std::unique_ptr<AMDGPULibFuncImpl>()) {} + AMDGPULibFunc(const AMDGPULibFunc &F); + /// Clone a mangled library func with the Id \p Id and argument info from \p + /// CopyFrom. + explicit AMDGPULibFunc(EFuncId Id, const AMDGPULibFunc &CopyFrom); + /// Construct an unmangled library function on the fly. + explicit AMDGPULibFunc(StringRef FName, FunctionType *FT); + + AMDGPULibFunc &operator=(const AMDGPULibFunc &F); + + /// Get unmangled name for mangled library function and name for unmangled + /// library function. + std::string getName() const { return Impl->getName(); } + unsigned getNumArgs() const { return Impl->getNumArgs(); } + EFuncId getId() const { return Impl->getId(); } + ENamePrefix getPrefix() const { return Impl->getPrefix(); } + /// Get leading parameters for mangled lib functions. 
+ Param *getLeads(); + const Param *getLeads() const; + + bool isMangled() const { return Impl->isMangled(); } + void setId(EFuncId Id) { Impl->setId(Id); } + bool parseFuncName(StringRef &MangledName) { + return Impl->parseFuncName(MangledName); + } + + /// \return The mangled function name for mangled library functions + /// and unmangled function name for unmangled library functions. + std::string mangle() const { return Impl->mangle(); } + + void setName(StringRef N) { Impl->setName(N); } + void setPrefix(ENamePrefix PFX) { Impl->setPrefix(PFX); } + + FunctionType *getFunctionType(Module &M) const { + return Impl->getFunctionType(M); + } + static Function *getFunction(llvm::Module *M, const AMDGPULibFunc &fInfo); + + static Function *getOrInsertFunction(llvm::Module *M, + const AMDGPULibFunc &fInfo); + static bool parse(StringRef MangledName, AMDGPULibFunc &Ptr); + +private: + /// Initialize as a mangled library function. + void initMangled(); + std::unique_ptr<AMDGPULibFuncImpl> Impl; +}; + +class AMDGPUMangledLibFunc : public AMDGPULibFuncImpl { +public: + Param Leads[2]; + + explicit AMDGPUMangledLibFunc(); + explicit AMDGPUMangledLibFunc(EFuncId id, + const AMDGPUMangledLibFunc ©From); + + std::string getName() const override; + unsigned getNumArgs() const override; + FunctionType *getFunctionType(Module &M) const override; + static StringRef getUnmangledName(StringRef MangledName); + + bool parseFuncName(StringRef &mangledName) override; + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static bool classof(const AMDGPULibFuncImpl *F) { return F->isMangled(); } + + std::string mangle() const override; + +private: + std::string mangleNameItanium() const; + + std::string mangleName(StringRef Name) const; + bool parseUnmangledName(StringRef MangledName); + + template <typename Stream> void writeName(Stream &OS) const; +}; + +class AMDGPUUnmangledLibFunc : public AMDGPULibFuncImpl { + FunctionType *FuncTy; + +public: + explicit AMDGPUUnmangledLibFunc(); + explicit AMDGPUUnmangledLibFunc(StringRef FName, FunctionType *FT) { + Name = FName; + FuncTy = FT; + } + std::string getName() const override { return Name; } + unsigned getNumArgs() const override; + FunctionType *getFunctionType(Module &M) const override { return FuncTy; } + + bool parseFuncName(StringRef &Name) override; + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static bool classof(const AMDGPULibFuncImpl *F) { return !F->isMangled(); } + + std::string mangle() const override { return Name; } + + void setFunctionType(FunctionType *FT) { FuncTy = FT; } +}; +} +#endif // _AMDGPU_LIBFUNC_H_ diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 63dd0d726d91..23fd8113932c 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -121,21 +121,39 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO, MCOp = MCOperand::createExpr(Expr); return true; } + case MachineOperand::MO_RegisterMask: + // Regmasks are like implicit defs. + return false; } } void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { unsigned Opcode = MI->getOpcode(); + const auto *TII = ST.getInstrInfo(); // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We // need to select it to the subtarget specific version, and there's no way to // do that with a single pseudo source operation. 
if (Opcode == AMDGPU::S_SETPC_B64_return) Opcode = AMDGPU::S_SETPC_B64; + else if (Opcode == AMDGPU::SI_CALL) { + // SI_CALL is just S_SWAPPC_B64 with an additional operand to track the + // called function (which we need to remove here). + OutMI.setOpcode(TII->pseudoToMCOpcode(AMDGPU::S_SWAPPC_B64)); + MCOperand Dest, Src; + lowerOperand(MI->getOperand(0), Dest); + lowerOperand(MI->getOperand(1), Src); + OutMI.addOperand(Dest); + OutMI.addOperand(Src); + return; + } else if (Opcode == AMDGPU::SI_TCRETURN) { + // TODO: How to use branch immediate and avoid register+add? + Opcode = AMDGPU::S_SETPC_B64; + } - int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode); + int MCOpcode = TII->pseudoToMCOpcode(Opcode); if (MCOpcode == -1) { - LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); + LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext(); C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " "a target-specific version: " + Twine(MI->getOpcode())); } @@ -187,7 +205,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { StringRef Err; if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) { - LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); + LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext(); C.emitError("Illegal instruction detected: " + Err); MI->print(errs()); } @@ -212,7 +230,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); Expr->print(Str, MAI); - OutStreamer->emitRawComment(" mask branch " + BBStr); + OutStreamer->emitRawComment(Twine(" mask branch ") + BBStr); } return; diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 9a391d06c9ea..20918233e447 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -14,46 +14,55 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/CFG.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegionInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DebugLoc.h" +#include "llvm/Pass.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> #include <tuple> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "amdgpucfgstructurizer" namespace { + class PHILinearizeDestIterator; class PHILinearize { friend class PHILinearizeDestIterator; public: - typedef std::pair<unsigned, MachineBasicBlock *> PHISourceT; + using PHISourceT = std::pair<unsigned, MachineBasicBlock *>; private: - 
typedef DenseSet<PHISourceT> PHISourcesT; - typedef struct { + using PHISourcesT = DenseSet<PHISourceT>; + using PHIInfoElementT = struct { unsigned DestReg; DebugLoc DL; PHISourcesT Sources; - } PHIInfoElementT; - typedef SmallPtrSet<PHIInfoElementT *, 2> PHIInfoT; + }; + using PHIInfoT = SmallPtrSet<PHIInfoElementT *, 2>; PHIInfoT PHIInfo; static unsigned phiInfoElementGetDest(PHIInfoElementT *Info); @@ -85,8 +94,8 @@ public: void dump(MachineRegisterInfo *MRI); void clear(); - typedef PHISourcesT::iterator source_iterator; - typedef PHILinearizeDestIterator dest_iterator; + using source_iterator = PHISourcesT::iterator; + using dest_iterator = PHILinearizeDestIterator; dest_iterator dests_begin(); dest_iterator dests_end(); @@ -100,6 +109,8 @@ private: PHILinearize::PHIInfoT::iterator Iter; public: + PHILinearizeDestIterator(PHILinearize::PHIInfoT::iterator I) : Iter(I) {} + unsigned operator*() { return PHILinearize::phiInfoElementGetDest(*Iter); } PHILinearizeDestIterator &operator++() { ++Iter; @@ -111,10 +122,10 @@ public: bool operator!=(const PHILinearizeDestIterator &I) const { return I.Iter != Iter; } - - PHILinearizeDestIterator(PHILinearize::PHIInfoT::iterator I) : Iter(I) {} }; +} // end anonymous namespace + unsigned PHILinearize::phiInfoElementGetDest(PHIInfoElementT *Info) { return Info->DestReg; } @@ -250,21 +261,23 @@ unsigned PHILinearize::getNumSources(unsigned DestReg) { return phiInfoElementGetSources(findPHIInfoElement(DestReg)).size(); } -void PHILinearize::dump(MachineRegisterInfo *MRI) { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void PHILinearize::dump(MachineRegisterInfo *MRI) { const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); dbgs() << "=PHIInfo Start=\n"; for (auto PII : this->PHIInfo) { PHIInfoElementT &Element = *PII; - dbgs() << "Dest: " << PrintReg(Element.DestReg, TRI) + dbgs() << "Dest: " << printReg(Element.DestReg, TRI) << " Sources: {"; for (auto &SI : Element.Sources) { - dbgs() << PrintReg(SI.first, TRI) << "(BB#" - << SI.second->getNumber() << "),"; + dbgs() << printReg(SI.first, TRI) << '(' << printMBBReference(*SI.second) + << "),"; } dbgs() << "}\n"; } dbgs() << "=PHIInfo End=\n"; } +#endif void PHILinearize::clear() { PHIInfo = PHIInfoT(); } @@ -280,14 +293,12 @@ PHILinearize::source_iterator PHILinearize::sources_begin(unsigned Reg) { auto InfoElement = findPHIInfoElement(Reg); return phiInfoElementGetSources(InfoElement).begin(); } + PHILinearize::source_iterator PHILinearize::sources_end(unsigned Reg) { auto InfoElement = findPHIInfoElement(Reg); return phiInfoElementGetSources(InfoElement).end(); } -class RegionMRT; -class MBBMRT; - static unsigned getPHINumInputs(MachineInstr &PHI) { assert(PHI.isPHI()); return (PHI.getNumOperands() - 1) / 2; @@ -313,6 +324,11 @@ static unsigned getPHIDestReg(MachineInstr &PHI) { return PHI.getOperand(0).getReg(); } +namespace { + +class RegionMRT; +class MBBMRT; + class LinearizedRegion { protected: MachineBasicBlock *Entry; @@ -347,6 +363,11 @@ protected: RegionMRT *TopRegion = nullptr); public: + LinearizedRegion(); + LinearizedRegion(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + ~LinearizedRegion() = default; + void setRegionMRT(RegionMRT *Region) { RMRT = Region; } RegionMRT *getRegionMRT() { return RMRT; } @@ -411,13 +432,6 @@ public: void initLiveOut(RegionMRT *Region, const MachineRegisterInfo *MRI, const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); - - LinearizedRegion(MachineBasicBlock 
*MBB, const MachineRegisterInfo *MRI, - const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); - - LinearizedRegion(); - - ~LinearizedRegion(); }; class MRT { @@ -427,6 +441,8 @@ protected: unsigned BBSelectRegOut; public: + virtual ~MRT() = default; + unsigned getBBSelectRegIn() { return BBSelectRegIn; } unsigned getBBSelectRegOut() { return BBSelectRegOut; } @@ -465,42 +481,55 @@ public: dbgs() << " "; } } - - virtual ~MRT() {} }; class MBBMRT : public MRT { MachineBasicBlock *MBB; public: - virtual MBBMRT *getMBBMRT() { return this; } + MBBMRT(MachineBasicBlock *BB) : MBB(BB) { + setParent(nullptr); + setBBSelectRegOut(0); + setBBSelectRegIn(0); + } + + MBBMRT *getMBBMRT() override { return this; } MachineBasicBlock *getMBB() { return MBB; } - virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) { + void dump(const TargetRegisterInfo *TRI, int depth = 0) override { dumpDepth(depth); dbgs() << "MBB: " << getMBB()->getNumber(); - dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI); - dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n"; - } - - MBBMRT(MachineBasicBlock *BB) : MBB(BB) { - setParent(nullptr); - setBBSelectRegOut(0); - setBBSelectRegIn(0); + dbgs() << " In: " << printReg(getBBSelectRegIn(), TRI); + dbgs() << ", Out: " << printReg(getBBSelectRegOut(), TRI) << "\n"; } }; class RegionMRT : public MRT { protected: MachineRegion *Region; - LinearizedRegion *LRegion; - MachineBasicBlock *Succ; - + LinearizedRegion *LRegion = nullptr; + MachineBasicBlock *Succ = nullptr; SetVector<MRT *> Children; public: - virtual RegionMRT *getRegionMRT() { return this; } + RegionMRT(MachineRegion *MachineRegion) : Region(MachineRegion) { + setParent(nullptr); + setBBSelectRegOut(0); + setBBSelectRegIn(0); + } + + ~RegionMRT() override { + if (LRegion) { + delete LRegion; + } + + for (auto CI : Children) { + delete &(*CI); + } + } + + RegionMRT *getRegionMRT() override { return this; } void setLinearizedRegion(LinearizedRegion *LinearizeRegion) { LRegion = LinearizeRegion; @@ -518,11 +547,11 @@ public: SetVector<MRT *> *getChildren() { return &Children; } - virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) { + void dump(const TargetRegisterInfo *TRI, int depth = 0) override { dumpDepth(depth); dbgs() << "Region: " << (void *)Region; - dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI); - dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n"; + dbgs() << " In: " << printReg(getBBSelectRegIn(), TRI); + dbgs() << ", Out: " << printReg(getBBSelectRegOut(), TRI) << "\n"; dumpDepth(depth); if (getSucc()) @@ -581,25 +610,10 @@ public: } } } - - RegionMRT(MachineRegion *MachineRegion) - : Region(MachineRegion), LRegion(nullptr), Succ(nullptr) { - setParent(nullptr); - setBBSelectRegOut(0); - setBBSelectRegIn(0); - } - - virtual ~RegionMRT() { - if (LRegion) { - delete LRegion; - } - - for (auto CI : Children) { - delete &(*CI); - } - } }; +} // end anonymous namespace + static unsigned createBBSelectReg(const SIInstrInfo *TII, MachineRegisterInfo *MRI) { return MRI->createVirtualRegister(TII->getPreferredSelectRegClass(32)); @@ -644,7 +658,7 @@ RegionMRT *MRT::buildMRT(MachineFunction &MF, continue; } - DEBUG(dbgs() << "Visiting BB#" << MBB->getNumber() << "\n"); + DEBUG(dbgs() << "Visiting " << printMBBReference(*MBB) << "\n"); MBBMRT *NewMBB = new MBBMRT(MBB); MachineRegion *Region = RegionInfo->getRegionFor(MBB); @@ -681,18 +695,18 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, const TargetRegisterInfo *TRI, 
PHILinearize &PHIInfo) { if (TRI->isVirtualRegister(Reg)) { - DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n"); + DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n"); // If this is a source register to a PHI we are chaining, it // must be live out. if (PHIInfo.isSource(Reg)) { - DEBUG(dbgs() << "Add LiveOut (PHI): " << PrintReg(Reg, TRI) << "\n"); + DEBUG(dbgs() << "Add LiveOut (PHI): " << printReg(Reg, TRI) << "\n"); addLiveOut(Reg); } else { // If this is live out of the MBB for (auto &UI : MRI->use_operands(Reg)) { if (UI.getParent()->getParent() != MBB) { - DEBUG(dbgs() << "Add LiveOut (MBB BB#" << MBB->getNumber() - << "): " << PrintReg(Reg, TRI) << "\n"); + DEBUG(dbgs() << "Add LiveOut (MBB " << printMBBReference(*MBB) + << "): " << printReg(Reg, TRI) << "\n"); addLiveOut(Reg); } else { // If the use is in the same MBB we have to make sure @@ -703,7 +717,7 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, MIE = UseInstr->getParent()->instr_end(); MII != MIE; ++MII) { if ((&(*MII)) == DefInstr) { - DEBUG(dbgs() << "Add LiveOut (Loop): " << PrintReg(Reg, TRI) + DEBUG(dbgs() << "Add LiveOut (Loop): " << printReg(Reg, TRI) << "\n"); addLiveOut(Reg); } @@ -720,11 +734,11 @@ void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg, const TargetRegisterInfo *TRI, PHILinearize &PHIInfo) { if (TRI->isVirtualRegister(Reg)) { - DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n"); + DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI) << "\n"); for (auto &UI : MRI->use_operands(Reg)) { if (!Region->contains(UI.getParent()->getParent())) { DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region - << "): " << PrintReg(Reg, TRI) << "\n"); + << "): " << printReg(Reg, TRI) << "\n"); addLiveOut(Reg); } } @@ -735,7 +749,8 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, const TargetRegisterInfo *TRI, PHILinearize &PHIInfo) { - DEBUG(dbgs() << "-Store Live Outs Begin (BB#" << MBB->getNumber() << ")-\n"); + DEBUG(dbgs() << "-Store Live Outs Begin (" << printMBBReference(*MBB) + << ")-\n"); for (auto &II : *MBB) { for (auto &RI : II.defs()) { storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo); @@ -759,9 +774,9 @@ void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB, for (int i = 0; i < numPreds; ++i) { if (getPHIPred(PHI, i) == MBB) { unsigned PHIReg = getPHISourceReg(PHI, i); - DEBUG(dbgs() << "Add LiveOut (PhiSource BB#" << MBB->getNumber() - << " -> BB#" << (*SI)->getNumber() - << "): " << PrintReg(PHIReg, TRI) << "\n"); + DEBUG(dbgs() << "Add LiveOut (PhiSource " << printMBBReference(*MBB) + << " -> " << printMBBReference(*(*SI)) + << "): " << printReg(PHIReg, TRI) << "\n"); addLiveOut(PHIReg); } } @@ -830,7 +845,7 @@ void LinearizedRegion::storeLiveOuts(RegionMRT *Region, if (Region->contains(getPHIPred(PHI, i))) { unsigned PHIReg = getPHISourceReg(PHI, i); DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region - << "): " << PrintReg(PHIReg, TRI) << "\n"); + << "): " << printReg(PHIReg, TRI) << "\n"); addLiveOut(PHIReg); } } @@ -839,6 +854,7 @@ void LinearizedRegion::storeLiveOuts(RegionMRT *Region, } } +#ifndef NDEBUG void LinearizedRegion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) { OS << "Linearized Region {"; bool IsFirst = true; @@ -852,13 +868,14 @@ void LinearizedRegion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) { } OS << "} (" << Entry->getNumber() << ", " << (Exit == 
nullptr ? -1 : Exit->getNumber()) - << "): In:" << PrintReg(getBBSelectRegIn(), TRI) - << " Out:" << PrintReg(getBBSelectRegOut(), TRI) << " {"; + << "): In:" << printReg(getBBSelectRegIn(), TRI) + << " Out:" << printReg(getBBSelectRegOut(), TRI) << " {"; for (auto &LI : LiveOuts) { - OS << PrintReg(LI, TRI) << " "; + OS << printReg(LI, TRI) << " "; } OS << "} \n"; } +#endif unsigned LinearizedRegion::getBBSelectRegIn() { return getRegionMRT()->getBBSelectRegIn(); @@ -893,8 +910,8 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister, assert(Register != NewRegister && "Cannot replace a reg with itself"); DEBUG(dbgs() << "Pepareing to replace register (region): " - << PrintReg(Register, MRI->getTargetRegisterInfo()) << " with " - << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); + << printReg(Register, MRI->getTargetRegisterInfo()) << " with " + << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); // If we are replacing outside, we also need to update the LiveOuts if (ReplaceOutside && @@ -930,14 +947,14 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister, if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { DEBUG(dbgs() << "Trying to substitute physical register: " - << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); llvm_unreachable("Cannot substitute physical registers"); } else { DEBUG(dbgs() << "Replacing register (region): " - << PrintReg(Register, MRI->getTargetRegisterInfo()) + << printReg(Register, MRI->getTargetRegisterInfo()) << " with " - << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); O.setReg(NewRegister); } @@ -1006,16 +1023,16 @@ void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) { continue; if (!MRI->hasOneDef(Reg)) { DEBUG(this->getEntry()->getParent()->dump()); - DEBUG(dbgs() << PrintReg(Reg, TRI) << "\n"); + DEBUG(dbgs() << printReg(Reg, TRI) << "\n"); } if (MRI->def_begin(Reg) == MRI->def_end()) { DEBUG(dbgs() << "Register " - << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << printReg(Reg, MRI->getTargetRegisterInfo()) << " has NO defs\n"); } else if (!MRI->hasOneDef(Reg)) { DEBUG(dbgs() << "Register " - << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << printReg(Reg, MRI->getTargetRegisterInfo()) << " has multiple defs\n"); } @@ -1025,7 +1042,7 @@ void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) { bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB; if (UseIsOutsideDefMBB && UseOperand->isKill()) { DEBUG(dbgs() << "Removing kill flag on register: " - << PrintReg(Reg, TRI) << "\n"); + << printReg(Reg, TRI) << "\n"); UseOperand->setIsKill(false); } } @@ -1059,7 +1076,7 @@ LinearizedRegion::LinearizedRegion() { Parent = nullptr; } -LinearizedRegion::~LinearizedRegion() {} +namespace { class AMDGPUMachineCFGStructurizer : public MachineFunctionPass { private: @@ -1070,6 +1087,7 @@ private: unsigned BBSelectRegister; PHILinearize PHIInfo; DenseMap<MachineBasicBlock *, MachineBasicBlock *> FallthroughMap; + RegionMRT *RMRT; void getPHIRegionIndices(RegionMRT *Region, MachineInstr &PHI, SmallVector<unsigned, 2> &RegionIndices); @@ -1193,15 +1211,15 @@ private: public: static char ID; + AMDGPUMachineCFGStructurizer() : MachineFunctionPass(ID) { + initializeAMDGPUMachineCFGStructurizerPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override 
{ AU.addRequired<MachineRegionInfoPass>(); MachineFunctionPass::getAnalysisUsage(AU); } - AMDGPUMachineCFGStructurizer() : MachineFunctionPass(ID) { - initializeAMDGPUMachineCFGStructurizerPass(*PassRegistry::getPassRegistry()); - } - void initFallthroughMap(MachineFunction &MF); void createLinearizedRegion(RegionMRT *Region, unsigned SelectOut); @@ -1210,14 +1228,14 @@ public: MachineRegisterInfo *MRI, const SIInstrInfo *TII); - RegionMRT *RMRT; void setRegionMRT(RegionMRT *RegionTree) { RMRT = RegionTree; } RegionMRT *getRegionMRT() { return RMRT; } bool runOnMachineFunction(MachineFunction &MF) override; }; -} + +} // end anonymous namespace char AMDGPUMachineCFGStructurizer::ID = 0; @@ -1254,7 +1272,6 @@ void AMDGPUMachineCFGStructurizer::transformSimpleIfRegion(RegionMRT *Region) { } static void fixMBBTerminator(MachineBasicBlock *MBB) { - if (MBB->succ_size() == 1) { auto *Succ = *(MBB->succ_begin()); for (auto &TI : MBB->terminators()) { @@ -1433,8 +1450,7 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, unsigned *ReplaceReg) { DEBUG(dbgs() << "Shrink PHI: "); DEBUG(PHI.dump()); - DEBUG(dbgs() << " to " << PrintReg(getPHIDestReg(PHI), TRI) - << "<def> = PHI("); + DEBUG(dbgs() << " to " << printReg(getPHIDestReg(PHI), TRI) << " = PHI("); bool Replaced = false; unsigned NumInputs = getPHINumInputs(PHI); @@ -1464,8 +1480,8 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, if (SourceMBB) { MIB.addReg(CombinedSourceReg); MIB.addMBB(SourceMBB); - DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#" - << SourceMBB->getNumber()); + DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " + << printMBBReference(*SourceMBB)); } for (unsigned i = 0; i < NumInputs; ++i) { @@ -1476,8 +1492,8 @@ bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI, MachineBasicBlock *SourcePred = getPHIPred(PHI, i); MIB.addReg(SourceReg); MIB.addMBB(SourcePred); - DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" - << SourcePred->getNumber()); + DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " + << printMBBReference(*SourcePred)); } DEBUG(dbgs() << ")\n"); } @@ -1490,8 +1506,7 @@ void AMDGPUMachineCFGStructurizer::replacePHI( SmallVector<unsigned, 2> &PHIRegionIndices) { DEBUG(dbgs() << "Replace PHI: "); DEBUG(PHI.dump()); - DEBUG(dbgs() << " with " << PrintReg(getPHIDestReg(PHI), TRI) - << "<def> = PHI("); + DEBUG(dbgs() << " with " << printReg(getPHIDestReg(PHI), TRI) << " = PHI("); bool HasExternalEdge = false; unsigned NumInputs = getPHINumInputs(PHI); @@ -1508,8 +1523,8 @@ void AMDGPUMachineCFGStructurizer::replacePHI( getPHIDestReg(PHI)); MIB.addReg(CombinedSourceReg); MIB.addMBB(LastMerge); - DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#" - << LastMerge->getNumber()); + DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " + << printMBBReference(*LastMerge)); for (unsigned i = 0; i < NumInputs; ++i) { if (isPHIRegionIndex(PHIRegionIndices, i)) { continue; @@ -1518,8 +1533,8 @@ void AMDGPUMachineCFGStructurizer::replacePHI( MachineBasicBlock *SourcePred = getPHIPred(PHI, i); MIB.addReg(SourceReg); MIB.addMBB(SourcePred); - DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" - << SourcePred->getNumber()); + DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " + << printMBBReference(*SourcePred)); } DEBUG(dbgs() << ")\n"); } else { @@ -1531,7 +1546,6 @@ void AMDGPUMachineCFGStructurizer::replacePHI( void AMDGPUMachineCFGStructurizer::replaceEntryPHI( MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB, SmallVector<unsigned, 
2> &PHIRegionIndices) { - DEBUG(dbgs() << "Replace entry PHI: "); DEBUG(PHI.dump()); DEBUG(dbgs() << " with "); @@ -1547,18 +1561,18 @@ void AMDGPUMachineCFGStructurizer::replaceEntryPHI( if (NumNonRegionInputs == 0) { auto DestReg = getPHIDestReg(PHI); replaceRegisterWith(DestReg, CombinedSourceReg); - DEBUG(dbgs() << " register " << PrintReg(CombinedSourceReg, TRI) << "\n"); + DEBUG(dbgs() << " register " << printReg(CombinedSourceReg, TRI) << "\n"); PHI.eraseFromParent(); } else { - DEBUG(dbgs() << PrintReg(getPHIDestReg(PHI), TRI) << "<def> = PHI("); + DEBUG(dbgs() << printReg(getPHIDestReg(PHI), TRI) << " = PHI("); MachineBasicBlock *MBB = PHI.getParent(); MachineInstrBuilder MIB = BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), getPHIDestReg(PHI)); MIB.addReg(CombinedSourceReg); MIB.addMBB(IfMBB); - DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#" - << IfMBB->getNumber()); + DEBUG(dbgs() << printReg(CombinedSourceReg, TRI) << ", " + << printMBBReference(*IfMBB)); unsigned NumInputs = getPHINumInputs(PHI); for (unsigned i = 0; i < NumInputs; ++i) { if (isPHIRegionIndex(PHIRegionIndices, i)) { @@ -1568,8 +1582,8 @@ void AMDGPUMachineCFGStructurizer::replaceEntryPHI( MachineBasicBlock *SourcePred = getPHIPred(PHI, i); MIB.addReg(SourceReg); MIB.addMBB(SourcePred); - DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" - << SourcePred->getNumber()); + DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " + << printMBBReference(*SourcePred)); } DEBUG(dbgs() << ")\n"); PHI.eraseFromParent(); @@ -1593,7 +1607,7 @@ void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs( } } - DEBUG(dbgs() << "Register " << PrintReg(Reg, TRI) << " is " + DEBUG(dbgs() << "Register " << printReg(Reg, TRI) << " is " << (IsDead ? "dead" : "alive") << " after PHI replace\n"); if (IsDead) { LRegion->removeLiveOut(Reg); @@ -1734,11 +1748,11 @@ void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB, if (MergeBB->succ_begin() == MergeBB->succ_end()) { return; } - DEBUG(dbgs() << "Merge PHI (BB#" << MergeBB->getNumber() - << "): " << PrintReg(DestRegister, TRI) << "<def> = PHI(" - << PrintReg(IfSourceRegister, TRI) << ", BB#" - << IfBB->getNumber() << PrintReg(CodeSourceRegister, TRI) - << ", BB#" << CodeBB->getNumber() << ")\n"); + DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB) + << "): " << printReg(DestRegister, TRI) << " = PHI(" + << printReg(IfSourceRegister, TRI) << ", " + << printMBBReference(*IfBB) << printReg(CodeSourceRegister, TRI) + << ", " << printMBBReference(*CodeBB) << ")\n"); const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin()); MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL, TII->get(TargetOpcode::PHI), DestRegister); @@ -1796,8 +1810,8 @@ static void removeExternalCFGEdges(MachineBasicBlock *StartMBB, for (auto SI : Succs) { std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI; - DEBUG(dbgs() << "Removing edge: BB#" << Edge.first->getNumber() << " -> BB#" - << Edge.second->getNumber() << "\n"); + DEBUG(dbgs() << "Removing edge: " << printMBBReference(*Edge.first) + << " -> " << printMBBReference(*Edge.second) << "\n"); Edge.first->removeSuccessor(Edge.second); } } @@ -1835,8 +1849,8 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock( if (!CodeBBEnd->isSuccessor(MergeBB)) CodeBBEnd->addSuccessor(MergeBB); - DEBUG(dbgs() << "Moved MBB#" << CodeBBStart->getNumber() << " through MBB#" - << CodeBBEnd->getNumber() << "\n"); + DEBUG(dbgs() << "Moved " << printMBBReference(*CodeBBStart) << " 
through " + << printMBBReference(*CodeBBEnd) << "\n"); // If we have a single predecessor we can find a reasonable debug location MachineBasicBlock *SinglePred = @@ -1921,10 +1935,10 @@ void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *Co MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) { if (MRI->def_begin(Reg) == MRI->def_end()) { - DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo()) + DEBUG(dbgs() << "Register " << printReg(Reg, MRI->getTargetRegisterInfo()) << " has NO defs\n"); } else if (!MRI->hasOneDef(Reg)) { - DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo()) + DEBUG(dbgs() << "Register " << printReg(Reg, MRI->getTargetRegisterInfo()) << " has multiple defs\n"); DEBUG(dbgs() << "DEFS BEGIN:\n"); for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) { @@ -2008,7 +2022,7 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB, } for (auto LI : OldLiveOuts) { - DEBUG(dbgs() << "LiveOut: " << PrintReg(LI, TRI)); + DEBUG(dbgs() << "LiveOut: " << printReg(LI, TRI)); if (!containsDef(CodeBB, InnerRegion, LI) || (!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) { // If the register simly lives through the CodeBB, we don't have @@ -2034,7 +2048,7 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB, unsigned IfSourceReg = MRI->createVirtualRegister(RegClass); // Create initializer, this value is never used, but is needed // to satisfy SSA. - DEBUG(dbgs() << "Initializer for reg: " << PrintReg(Reg) << "\n"); + DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n"); TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(), IfSourceReg, 0); @@ -2049,7 +2063,7 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB, // is a source block for a definition. 
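// Editorial note, not part of this commit: most of the churn in this file is
// mechanical. PrintReg/"BB#N" debug printing was replaced with printReg and
// printMBBReference, i.e. roughly
//   old:  dbgs() << PrintReg(Reg, TRI) << ", BB#" << MBB->getNumber();
//   new:  dbgs() << printReg(Reg, TRI) << ", " << printMBBReference(*MBB);
// so virtual registers and blocks now print in the current upstream form
// (for example "%7" and "%bb.3" rather than "%vreg7" and "BB#3"); the exact
// text is whatever printReg/printMBBReference emit in this LLVM revision.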
SmallVector<unsigned, 4> Sources; if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) { - DEBUG(dbgs() << "Inserting PHI Live Out from BB#" << CodeBB->getNumber() + DEBUG(dbgs() << "Inserting PHI Live Out from " << printMBBReference(*CodeBB) << "\n"); for (auto SI : Sources) { unsigned DestReg; @@ -2131,7 +2145,7 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio const DebugLoc &DL = Entry->findDebugLoc(Entry->begin()); MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL, TII->get(TargetOpcode::PHI), DestReg); - DEBUG(dbgs() << "Entry PHI " << PrintReg(DestReg, TRI) << "<def> = PHI("); + DEBUG(dbgs() << "Entry PHI " << printReg(DestReg, TRI) << " = PHI("); unsigned CurrentBackedgeReg = 0; @@ -2156,17 +2170,18 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio BackedgePHI.addMBB((*SRI).second); CurrentBackedgeReg = NewBackedgeReg; DEBUG(dbgs() << "Inserting backedge PHI: " - << PrintReg(NewBackedgeReg, TRI) << "<def> = PHI(" - << PrintReg(CurrentBackedgeReg, TRI) << ", BB#" - << getPHIPred(*PHIDefInstr, 0)->getNumber() << ", " - << PrintReg(getPHISourceReg(*PHIDefInstr, 1), TRI) - << ", BB#" << (*SRI).second->getNumber()); + << printReg(NewBackedgeReg, TRI) << " = PHI(" + << printReg(CurrentBackedgeReg, TRI) << ", " + << printMBBReference(*getPHIPred(*PHIDefInstr, 0)) + << ", " + << printReg(getPHISourceReg(*PHIDefInstr, 1), TRI) + << ", " << printMBBReference(*(*SRI).second)); } } else { MIB.addReg(SourceReg); MIB.addMBB((*SRI).second); - DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" - << (*SRI).second->getNumber() << ", "); + DEBUG(dbgs() << printReg(SourceReg, TRI) << ", " + << printMBBReference(*(*SRI).second) << ", "); } } @@ -2174,8 +2189,8 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio if (CurrentBackedgeReg != 0) { MIB.addReg(CurrentBackedgeReg); MIB.addMBB(Exit); - DEBUG(dbgs() << PrintReg(CurrentBackedgeReg, TRI) << ", BB#" - << Exit->getNumber() << ")\n"); + DEBUG(dbgs() << printReg(CurrentBackedgeReg, TRI) << ", " + << printMBBReference(*Exit) << ")\n"); } else { DEBUG(dbgs() << ")\n"); } @@ -2205,7 +2220,7 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register, ++I; if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { DEBUG(dbgs() << "Trying to substitute physical register: " - << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); llvm_unreachable("Cannot substitute physical registers"); // We don't handle physical registers, but if we need to @@ -2213,9 +2228,9 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register, // O.substPhysReg(NewRegister, *TRI); } else { DEBUG(dbgs() << "Replacing register: " - << PrintReg(Register, MRI->getTargetRegisterInfo()) + << printReg(Register, MRI->getTargetRegisterInfo()) << " with " - << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << printReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n"); O.setReg(NewRegister); } @@ -2233,11 +2248,11 @@ void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEn for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE; ++DRI) { unsigned DestReg = *DRI; - DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI) << "\n"); + DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) << "\n"); auto SRI = PHIInfo.sources_begin(DestReg); unsigned SourceReg = (*SRI).first; - DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI) - << " 
SourceReg: " << PrintReg(SourceReg, TRI) << "\n"); + DEBUG(dbgs() << "DestReg: " << printReg(DestReg, TRI) + << " SourceReg: " << printReg(SourceReg, TRI) << "\n"); assert(PHIInfo.sources_end(DestReg) == ++SRI && "More than one phi source in entry node"); @@ -2424,15 +2439,15 @@ void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI, MachineInstrBuilder MIB = BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), NewDestReg); - DEBUG(dbgs() << "Split Entry PHI " << PrintReg(NewDestReg, TRI) - << "<def> = PHI("); + DEBUG(dbgs() << "Split Entry PHI " << printReg(NewDestReg, TRI) << " = PHI("); MIB.addReg(PHISource); MIB.addMBB(Entry); - DEBUG(dbgs() << PrintReg(PHISource, TRI) << ", BB#" << Entry->getNumber()); + DEBUG(dbgs() << printReg(PHISource, TRI) << ", " + << printMBBReference(*Entry)); MIB.addReg(RegionSourceReg); MIB.addMBB(RegionSourceMBB); - DEBUG(dbgs() << " ," << PrintReg(RegionSourceReg, TRI) << ", BB#" - << RegionSourceMBB->getNumber() << ")\n"); + DEBUG(dbgs() << " ," << printReg(RegionSourceReg, TRI) << ", " + << printMBBReference(*RegionSourceMBB) << ")\n"); } void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry, @@ -2487,7 +2502,6 @@ AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) { return NewExit; } - static MachineBasicBlock *split(MachineBasicBlock::iterator I) { // Create the fall-through block. MachineBasicBlock *MBB = (*I).getParent(); @@ -2514,9 +2528,9 @@ AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) { MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI()); MachineBasicBlock *Exit = LRegion->getExit(); - DEBUG(dbgs() << "Split BB#" << Entry->getNumber() << " to BB#" - << Entry->getNumber() << " -> BB#" << EntrySucc->getNumber() - << "\n"); + DEBUG(dbgs() << "Split " << printMBBReference(*Entry) << " to " + << printMBBReference(*Entry) << " -> " + << printMBBReference(*EntrySucc) << "\n"); LRegion->addMBB(EntrySucc); // Make the backedge go to Entry Succ @@ -2655,9 +2669,9 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { BBSelectRegOut = Child->getBBSelectRegOut(); BBSelectRegIn = Child->getBBSelectRegIn(); - DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI) + DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI) << "\n"); - DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI) + DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI) << "\n"); MachineBasicBlock *IfEnd = CurrentMerge; @@ -2679,9 +2693,9 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) { BBSelectRegOut = Child->getBBSelectRegOut(); BBSelectRegIn = Child->getBBSelectRegIn(); - DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI) + DEBUG(dbgs() << "BBSelectRegIn: " << printReg(BBSelectRegIn, TRI) << "\n"); - DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI) + DEBUG(dbgs() << "BBSelectRegOut: " << printReg(BBSelectRegOut, TRI) << "\n"); MachineBasicBlock *IfEnd = CurrentMerge; @@ -2786,7 +2800,7 @@ void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region, LinearizedRegion *LRegion = new LinearizedRegion(); if (SelectOut) { LRegion->addLiveOut(SelectOut); - DEBUG(dbgs() << "Add LiveOut (BBSelect): " << PrintReg(SelectOut, TRI) + DEBUG(dbgs() << "Add LiveOut (BBSelect): " << printReg(SelectOut, TRI) << "\n"); } LRegion->setRegionMRT(Region); @@ -2841,16 +2855,6 @@ static void checkRegOnlyPHIInputs(MachineFunction &MF) { 
} } - -INITIALIZE_PASS_BEGIN(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer", - "AMDGPU Machine CFG Structurizer", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineRegionInfoPass) -INITIALIZE_PASS_END(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer", - "AMDGPU Machine CFG Structurizer", false, false) - -char AMDGPUMachineCFGStructurizerID = AMDGPUMachineCFGStructurizer::ID; - - bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -2876,6 +2880,14 @@ bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) { return result; } +char AMDGPUMachineCFGStructurizerID = AMDGPUMachineCFGStructurizer::ID; + +INITIALIZE_PASS_BEGIN(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer", + "AMDGPU Machine CFG Structurizer", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineRegionInfoPass) +INITIALIZE_PASS_END(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer", + "AMDGPU Machine CFG Structurizer", false, false) + FunctionPass *llvm::createAMDGPUMachineCFGStructurizerPass() { return new AMDGPUMachineCFGStructurizer(); } diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 9fb7f5f88927..b7c8c1213537 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -19,7 +19,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MaxKernArgAlign(0), LDSSize(0), ABIArgOffset(0), - IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction()->getCallingConv())), + IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())), NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) { // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset, // except reserved size is not correctly aligned. diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp new file mode 100644 index 000000000000..3164140abe29 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp @@ -0,0 +1,29 @@ +//===--- AMDGPUMachineModuleInfo.cpp ----------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Machine Module Info. 
+/// +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMachineModuleInfo.h" +#include "llvm/IR/Module.h" + +namespace llvm { + +AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI) + : MachineModuleInfoELF(MMI) { + LLVMContext &CTX = MMI.getModule()->getContext(); + AgentSSID = CTX.getOrInsertSyncScopeID("agent"); + WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup"); + WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront"); +} + +} // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h new file mode 100644 index 000000000000..1a728c6bd04a --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h @@ -0,0 +1,97 @@ +//===--- AMDGPUMachineModuleInfo.h ------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Machine Module Info. +/// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEMODULEINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEMODULEINFO_H + +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/IR/LLVMContext.h" + +namespace llvm { + +class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF { +private: + + // All supported memory/synchronization scopes can be found here: + // http://llvm.org/docs/AMDGPUUsage.html#memory-scopes + + /// \brief Agent synchronization scope ID. + SyncScope::ID AgentSSID; + /// \brief Workgroup synchronization scope ID. + SyncScope::ID WorkgroupSSID; + /// \brief Wavefront synchronization scope ID. + SyncScope::ID WavefrontSSID; + + /// \brief In AMDGPU target synchronization scopes are inclusive, meaning a + /// larger synchronization scope is inclusive of a smaller synchronization + /// scope. + /// + /// \returns \p SSID's inclusion ordering, or "None" if \p SSID is not + /// supported by the AMDGPU target. + Optional<uint8_t> getSyncScopeInclusionOrdering(SyncScope::ID SSID) const { + if (SSID == SyncScope::SingleThread) + return 0; + else if (SSID == getWavefrontSSID()) + return 1; + else if (SSID == getWorkgroupSSID()) + return 2; + else if (SSID == getAgentSSID()) + return 3; + else if (SSID == SyncScope::System) + return 4; + + return None; + } + +public: + AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI); + + /// \returns Agent synchronization scope ID. + SyncScope::ID getAgentSSID() const { + return AgentSSID; + } + /// \returns Workgroup synchronization scope ID. + SyncScope::ID getWorkgroupSSID() const { + return WorkgroupSSID; + } + /// \returns Wavefront synchronization scope ID. + SyncScope::ID getWavefrontSSID() const { + return WavefrontSSID; + } + + /// \brief In AMDGPU target synchronization scopes are inclusive, meaning a + /// larger synchronization scope is inclusive of a smaller synchronization + /// scope. 
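// Editorial sketch, not part of this commit: one way a client could consume
// the inclusion query declared immediately below. The helper name
// fenceCoversWorkgroup is illustrative only; the Optional<bool> result
// separates "strictly smaller scope" from "scope unknown to the target".
static bool fenceCoversWorkgroup(const llvm::AMDGPUMachineModuleInfo &MMI,
                                 llvm::SyncScope::ID FenceSSID) {
  // isSyncScopeInclusion(A, B) answers "is A at least as wide as B?".
  if (llvm::Optional<bool> Covers =
          MMI.isSyncScopeInclusion(FenceSSID, MMI.getWorkgroupSSID()))
    return *Covers;
  return false; // unrecognized synchronization scope: be conservative
}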
+ /// + /// \returns True if synchronization scope \p A is larger than or equal to + /// synchronization scope \p B, false if synchronization scope \p A is smaller + /// than synchronization scope \p B, or "None" if either synchronization scope + /// \p A or \p B is not supported by the AMDGPU target. + Optional<bool> isSyncScopeInclusion(SyncScope::ID A, SyncScope::ID B) const { + const auto &AIO = getSyncScopeInclusionOrdering(A); + const auto &BIO = getSyncScopeInclusionOrdering(B); + if (!AIO || !BIO) + return None; + + return AIO.getValue() > BIO.getValue(); + } +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEMODULEINFO_H diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp new file mode 100644 index 000000000000..bb65636f15af --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -0,0 +1,135 @@ +//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file +// \brief This post-linking pass replaces the function pointer of enqueued +// block kernel with a global variable (runtime handle) and adds +// "runtime-handle" attribute to the enqueued block kernel. +// +// In LLVM CodeGen the runtime-handle metadata will be translated to +// RuntimeHandle metadata in code object. Runtime allocates a global buffer +// for each kernel with RuntimeHandel metadata and saves the kernel address +// required for the AQL packet into the buffer. __enqueue_kernel function +// in device library knows that the invoke function pointer in the block +// literal is actually runtime handle and loads the kernel address from it +// and put it into AQL packet for dispatching. +// +// This cannot be done in FE since FE cannot create a unique global variable +// with external linkage across LLVM modules. The global variable with internal +// linkage does not work since optimization passes will try to replace loads +// of the global variable with its initialization value. +// +// It also identifies the kernels directly or indirectly enqueues kernels +// and adds "calls-enqueue-kernel" function attribute to them, which will +// be used to determine whether to emit runtime metadata for the kernel +// enqueue related hidden kernel arguments. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/User.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "amdgpu-lower-enqueued-block" + +using namespace llvm; + +namespace { + +/// \brief Lower enqueued blocks. 
+class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass { +public: + static char ID; + + explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {} + +private: + bool runOnModule(Module &M) override; +}; + +} // end anonymous namespace + +char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0; + +char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID = + AMDGPUOpenCLEnqueuedBlockLowering::ID; + +INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE, + "Lower OpenCL enqueued blocks", false, false) + +ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() { + return new AMDGPUOpenCLEnqueuedBlockLowering(); +} + +/// Collect direct or indrect callers of \p F and save them +/// to \p Callers. +static void collectCallers(Function *F, DenseSet<Function *> &Callers) { + for (auto U : F->users()) { + if (auto *CI = dyn_cast<CallInst>(&*U)) { + auto *Caller = CI->getParent()->getParent(); + if (Callers.count(Caller)) + continue; + Callers.insert(Caller); + collectCallers(Caller, Callers); + } + } +} + +bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { + DenseSet<Function *> Callers; + auto &C = M.getContext(); + bool Changed = false; + for (auto &F : M.functions()) { + if (F.hasFnAttribute("enqueued-block")) { + if (!F.hasOneUse() || !F.user_begin()->hasOneUse() || + !isa<ConstantExpr>(*F.user_begin()) || + !isa<ConstantExpr>(*F.user_begin()->user_begin())) { + continue; + } + auto *BitCast = cast<ConstantExpr>(*F.user_begin()); + auto *AddrCast = cast<ConstantExpr>(*BitCast->user_begin()); + auto RuntimeHandle = (F.getName() + "_runtime_handle").str(); + auto *GV = new GlobalVariable( + M, Type::getInt8Ty(C)->getPointerTo(AMDGPUAS::GLOBAL_ADDRESS), + /*IsConstant=*/true, GlobalValue::ExternalLinkage, + /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr, + GlobalValue::NotThreadLocal, AMDGPUAS::GLOBAL_ADDRESS, + /*IsExternallyInitialized=*/true); + DEBUG(dbgs() << "runtime handle created: " << *GV << '\n'); + auto *NewPtr = ConstantExpr::getPointerCast(GV, AddrCast->getType()); + AddrCast->replaceAllUsesWith(NewPtr); + F.addFnAttr("runtime-handle", RuntimeHandle); + F.setLinkage(GlobalValue::ExternalLinkage); + + // Collect direct or indirect callers of enqueue_kernel. + for (auto U : NewPtr->users()) { + if (auto *I = dyn_cast<Instruction>(&*U)) { + auto *F = I->getParent()->getParent(); + Callers.insert(F); + collectCallers(F, Callers); + } + } + Changed = true; + } + } + + for (auto F : Callers) { + if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL) + continue; + F->addFnAttr("calls-enqueue-kernel"); + } + return Changed; +} diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp index 410bd52d9c21..cd71f19760b9 100644 --- a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp +++ b/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUOpenCLImageTypeLoweringPass.cpp -----------------------------===// +//===- AMDGPUOpenCLImageTypeLoweringPass.cpp ------------------------------===// // // The LLVM Compiler Infrastructure // @@ -22,40 +22,57 @@ /// Resource IDs of read-only images, write-only images and samplers are /// defined to be their index among the kernel arguments of the same /// type and access qualifier. 
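// Editorial example, not part of this commit, of the resource-ID rule stated
// in the file comment above: for a kernel declared as
//
//   kernel void k(read_only image2d_t A, write_only image2d_t B,
//                 read_only image2d_t C, sampler_t S);
//
// IDs are indices among arguments of the same type and access qualifier, so
// A and C get read-only image IDs 0 and 1, B gets write-only image ID 0, and
// S gets sampler ID 0.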
+// //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/Passes.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <tuple> using namespace llvm; -namespace { - -StringRef GetImageSizeFunc = "llvm.OpenCL.image.get.size"; -StringRef GetImageFormatFunc = "llvm.OpenCL.image.get.format"; -StringRef GetImageResourceIDFunc = "llvm.OpenCL.image.get.resource.id"; -StringRef GetSamplerResourceIDFunc = "llvm.OpenCL.sampler.get.resource.id"; +static StringRef GetImageSizeFunc = "llvm.OpenCL.image.get.size"; +static StringRef GetImageFormatFunc = "llvm.OpenCL.image.get.format"; +static StringRef GetImageResourceIDFunc = "llvm.OpenCL.image.get.resource.id"; +static StringRef GetSamplerResourceIDFunc = + "llvm.OpenCL.sampler.get.resource.id"; -StringRef ImageSizeArgMDType = "__llvm_image_size"; -StringRef ImageFormatArgMDType = "__llvm_image_format"; +static StringRef ImageSizeArgMDType = "__llvm_image_size"; +static StringRef ImageFormatArgMDType = "__llvm_image_format"; -StringRef KernelsMDNodeName = "opencl.kernels"; -StringRef KernelArgMDNodeNames[] = { +static StringRef KernelsMDNodeName = "opencl.kernels"; +static StringRef KernelArgMDNodeNames[] = { "kernel_arg_addr_space", "kernel_arg_access_qual", "kernel_arg_type", "kernel_arg_base_type", "kernel_arg_type_qual"}; -const unsigned NumKernelArgMDNodes = 5; +static const unsigned NumKernelArgMDNodes = 5; + +namespace { -typedef SmallVector<Metadata *, 8> MDVector; +using MDVector = SmallVector<Metadata *, 8>; struct KernelArgMD { MDVector ArgVector[NumKernelArgMDNodes]; }; @@ -303,7 +320,7 @@ class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass { CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns); // Build new MDNode. 
- SmallVector<llvm::Metadata *, 6> KernelMDArgs; + SmallVector<Metadata *, 6> KernelMDArgs; KernelMDArgs.push_back(ConstantAsMetadata::get(NewF)); for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i])); @@ -346,7 +363,7 @@ class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass { return Modified; } - public: +public: AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {} bool runOnModule(Module &M) override { @@ -363,10 +380,10 @@ class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass { } }; -char AMDGPUOpenCLImageTypeLoweringPass::ID = 0; - } // end anonymous namespace +char AMDGPUOpenCLImageTypeLoweringPass::ID = 0; + ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() { return new AMDGPUOpenCLImageTypeLoweringPass(); } diff --git a/lib/Target/AMDGPU/AMDGPUPTNote.h b/lib/Target/AMDGPU/AMDGPUPTNote.h index 71b9ab699b96..b50a2eb8e9e7 100644 --- a/lib/Target/AMDGPU/AMDGPUPTNote.h +++ b/lib/Target/AMDGPU/AMDGPUPTNote.h @@ -25,18 +25,22 @@ const char SectionName[] = ".note"; const char NoteName[] = "AMD"; -// TODO: Move this enum to include/llvm/Support so it can be used in tools? +// TODO: Remove this file once we drop code object v2. enum NoteType{ + NT_AMDGPU_HSA_RESERVED_0 = 0, NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1, NT_AMDGPU_HSA_HSAIL = 2, NT_AMDGPU_HSA_ISA = 3, NT_AMDGPU_HSA_PRODUCER = 4, NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5, NT_AMDGPU_HSA_EXTENSION = 6, - NT_AMDGPU_HSA_CODE_OBJECT_METADATA = 10, + NT_AMDGPU_HSA_RESERVED_7 = 7, + NT_AMDGPU_HSA_RESERVED_8 = 8, + NT_AMDGPU_HSA_RESERVED_9 = 9, NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101, NT_AMDGPU_HSA_HLDEBUG_TARGET = 102 }; + } } diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 625c9b77e2de..41876ed45c8c 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -285,9 +285,9 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) { return CI; } -static VectorType *arrayTypeToVecType(Type *ArrayTy) { - return VectorType::get(ArrayTy->getArrayElementType(), - ArrayTy->getArrayNumElements()); +static VectorType *arrayTypeToVecType(ArrayType *ArrayTy) { + return VectorType::get(ArrayTy->getElementType(), + ArrayTy->getNumElements()); } static Value * @@ -346,10 +346,9 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) { // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. 
Potentially these // could also be promoted but we don't currently handle this case if (!AllocaTy || - AllocaTy->getElementType()->isVectorTy() || - AllocaTy->getElementType()->isArrayTy() || AllocaTy->getNumElements() > 4 || - AllocaTy->getNumElements() < 2) { + AllocaTy->getNumElements() < 2 || + !VectorType::isValidElementType(AllocaTy->getElementType())) { DEBUG(dbgs() << " Cannot convert type to vector\n"); return false; } diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 623b2c88ab8f..1ed02fae085a 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -17,9 +17,9 @@ #include "SIRegisterInfo.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Constants.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" #define GET_TARGET_REGBANK_IMPL #include "AMDGPUGenRegisterBank.inc" @@ -29,10 +29,6 @@ using namespace llvm; -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "You shouldn't build this" -#endif - AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) : AMDGPUGenRegisterBankInfo(), TRI(static_cast<const SIRegisterInfo*>(&TRI)) { diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp index ff58aa5741a1..5e4d33aaa691 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -43,10 +43,11 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { // Forced to be here by one .inc const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( const MachineFunction *MF) const { - CallingConv::ID CC = MF->getFunction()->getCallingConv(); + CallingConv::ID CC = MF->getFunction().getCallingConv(); switch (CC) { case CallingConv::C: case CallingConv::Fast: + case CallingConv::Cold: return CSR_AMDGPU_HighRegs_SaveList; default: { // Dummy to not crash RegisterClassInfo. @@ -56,11 +57,17 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs( } } +const MCPhysReg * +SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const { + return nullptr; +} + const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { switch (CC) { case CallingConv::C: case CallingConv::Fast: + case CallingConv::Cold: return CSR_AMDGPU_HighRegs_RegMask; default: return nullptr; diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/lib/Target/AMDGPU/AMDGPURegisterInfo.td index ba0490abee8c..3bbcba826f63 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.td +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.td @@ -17,8 +17,6 @@ foreach Index = 0-15 in { def sub#Index : SubRegIndex<32, !shl(Index, 5)>; } -def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">; - } include "R600RegisterInfo.td" diff --git a/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp new file mode 100644 index 000000000000..83e56a9ab495 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp @@ -0,0 +1,483 @@ +//===- AMDGPURewriteOutArgumentsPass.cpp - Create struct returns ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
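// Editorial note, not part of this commit, on the AMDGPUPromoteAlloca hunk
// above: the promotion check now asks VectorType::isValidElementType (i.e.
// integer, floating-point, or pointer element types only) instead of
// explicitly rejecting vector and array elements. Illustrative cases, not
// taken from the diff:
//   alloca [4 x i32]        -> eligible for promotion to <4 x i32>
//   alloca [2 x [2 x i32]]  -> rejected (array element type), as the retained
//                              FIXME above notes
//   alloca [8 x float]      -> rejected (more than 4 elements)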
+// +//===----------------------------------------------------------------------===// +// +/// \file This pass attempts to replace out argument usage with a return of a +/// struct. +/// +/// We can support returning a lot of values directly in registers, but +/// idiomatic C code frequently uses a pointer argument to return a second value +/// rather than returning a struct by value. GPU stack access is also quite +/// painful, so we want to avoid that if possible. Passing a stack object +/// pointer to a function also requires an additional address expansion code +/// sequence to convert the pointer to be relative to the kernel's scratch wave +/// offset register since the callee doesn't know what stack frame the incoming +/// pointer is relative to. +/// +/// The goal is to try rewriting code that looks like this: +/// +/// int foo(int a, int b, int* out) { +/// *out = bar(); +/// return a + b; +/// } +/// +/// into something like this: +/// +/// std::pair<int, int> foo(int a, int b) { +/// return std::make_pair(a + b, bar()); +/// } +/// +/// Typically the incoming pointer is a simple alloca for a temporary variable +/// to use the API, which if replaced with a struct return will be easily SROA'd +/// out when the stub function we create is inlined +/// +/// This pass introduces the struct return, but leaves the unused pointer +/// arguments and introduces a new stub function calling the struct returning +/// body. DeadArgumentElimination should be run after this to clean these up. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <utility> + +#define DEBUG_TYPE "amdgpu-rewrite-out-arguments" + +using namespace llvm; + +static cl::opt<bool> AnyAddressSpace( + "amdgpu-any-address-space-out-arguments", + cl::desc("Replace pointer out arguments with " + "struct returns for non-private address space"), + cl::Hidden, + cl::init(false)); + +static cl::opt<unsigned> MaxNumRetRegs( + "amdgpu-max-return-arg-num-regs", + cl::desc("Approximately limit number of return registers for replacing out arguments"), + cl::Hidden, + cl::init(16)); + +STATISTIC(NumOutArgumentsReplaced, + "Number out arguments moved to struct return values"); +STATISTIC(NumOutArgumentFunctionsReplaced, + "Number of functions with out arguments moved to struct return values"); + +namespace { + +class AMDGPURewriteOutArguments : public FunctionPass { +private: + const DataLayout *DL = nullptr; + MemoryDependenceResults *MDA = nullptr; + + bool checkArgumentUses(Value &Arg) const; + bool isOutArgumentCandidate(Argument &Arg) const; + +#ifndef NDEBUG + bool 
isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const; +#endif + +public: + static char ID; + + AMDGPURewriteOutArguments() : FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MemoryDependenceWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } + + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; +}; + +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(AMDGPURewriteOutArguments, DEBUG_TYPE, + "AMDGPU Rewrite Out Arguments", false, false) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) +INITIALIZE_PASS_END(AMDGPURewriteOutArguments, DEBUG_TYPE, + "AMDGPU Rewrite Out Arguments", false, false) + +char AMDGPURewriteOutArguments::ID = 0; + +bool AMDGPURewriteOutArguments::checkArgumentUses(Value &Arg) const { + const int MaxUses = 10; + int UseCount = 0; + + for (Use &U : Arg.uses()) { + StoreInst *SI = dyn_cast<StoreInst>(U.getUser()); + if (UseCount > MaxUses) + return false; + + if (!SI) { + auto *BCI = dyn_cast<BitCastInst>(U.getUser()); + if (!BCI || !BCI->hasOneUse()) + return false; + + // We don't handle multiple stores currently, so stores to aggregate + // pointers aren't worth the trouble since they are canonically split up. + Type *DestEltTy = BCI->getType()->getPointerElementType(); + if (DestEltTy->isAggregateType()) + return false; + + // We could handle these if we had a convenient way to bitcast between + // them. + Type *SrcEltTy = Arg.getType()->getPointerElementType(); + if (SrcEltTy->isArrayTy()) + return false; + + // Special case handle structs with single members. It is useful to handle + // some casts between structs and non-structs, but we can't bitcast + // directly between them. directly bitcast between them. Blender uses + // some casts that look like { <3 x float> }* to <4 x float>* + if ((SrcEltTy->isStructTy() && (SrcEltTy->getNumContainedTypes() != 1))) + return false; + + // Clang emits OpenCL 3-vector type accesses with a bitcast to the + // equivalent 4-element vector and accesses that, and we're looking for + // this pointer cast. + if (DL->getTypeAllocSize(SrcEltTy) != DL->getTypeAllocSize(DestEltTy)) + return false; + + return checkArgumentUses(*BCI); + } + + if (!SI->isSimple() || + U.getOperandNo() != StoreInst::getPointerOperandIndex()) + return false; + + ++UseCount; + } + + // Skip unused arguments. + return UseCount > 0; +} + +bool AMDGPURewriteOutArguments::isOutArgumentCandidate(Argument &Arg) const { + const unsigned MaxOutArgSizeBytes = 4 * MaxNumRetRegs; + PointerType *ArgTy = dyn_cast<PointerType>(Arg.getType()); + + // TODO: It might be useful for any out arguments, not just privates. 
+ if (!ArgTy || (ArgTy->getAddressSpace() != DL->getAllocaAddrSpace() && + !AnyAddressSpace) || + Arg.hasByValAttr() || Arg.hasStructRetAttr() || + DL->getTypeStoreSize(ArgTy->getPointerElementType()) > MaxOutArgSizeBytes) { + return false; + } + + return checkArgumentUses(Arg); +} + +bool AMDGPURewriteOutArguments::doInitialization(Module &M) { + DL = &M.getDataLayout(); + return false; +} + +#ifndef NDEBUG +bool AMDGPURewriteOutArguments::isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const { + VectorType *VT0 = dyn_cast<VectorType>(Ty0); + VectorType *VT1 = dyn_cast<VectorType>(Ty1); + if (!VT0 || !VT1) + return false; + + if (VT0->getNumElements() != 3 || + VT1->getNumElements() != 4) + return false; + + return DL->getTypeSizeInBits(VT0->getElementType()) == + DL->getTypeSizeInBits(VT1->getElementType()); +} +#endif + +bool AMDGPURewriteOutArguments::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + // TODO: Could probably handle variadic functions. + if (F.isVarArg() || F.hasStructRetAttr() || + AMDGPU::isEntryFunctionCC(F.getCallingConv())) + return false; + + MDA = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); + + unsigned ReturnNumRegs = 0; + SmallSet<int, 4> OutArgIndexes; + SmallVector<Type *, 4> ReturnTypes; + Type *RetTy = F.getReturnType(); + if (!RetTy->isVoidTy()) { + ReturnNumRegs = DL->getTypeStoreSize(RetTy) / 4; + + if (ReturnNumRegs >= MaxNumRetRegs) + return false; + + ReturnTypes.push_back(RetTy); + } + + SmallVector<Argument *, 4> OutArgs; + for (Argument &Arg : F.args()) { + if (isOutArgumentCandidate(Arg)) { + DEBUG(dbgs() << "Found possible out argument " << Arg + << " in function " << F.getName() << '\n'); + OutArgs.push_back(&Arg); + } + } + + if (OutArgs.empty()) + return false; + + using ReplacementVec = SmallVector<std::pair<Argument *, Value *>, 4>; + + DenseMap<ReturnInst *, ReplacementVec> Replacements; + + SmallVector<ReturnInst *, 4> Returns; + for (BasicBlock &BB : F) { + if (ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) + Returns.push_back(RI); + } + + if (Returns.empty()) + return false; + + bool Changing; + + do { + Changing = false; + + // Keep retrying if we are able to successfully eliminate an argument. This + // helps with cases where multiple arguments may alias, such as in a + // sincos implementation. If we have 2 stores to arguments, on the first + // attempt the MDA query will succeed for the second store but not the + // first. On the second iteration we've removed the clobbering out argument + // (by effectively moving it into another function) and will find the second + // argument is OK to move. + for (Argument *OutArg : OutArgs) { + bool ThisReplaceable = true; + SmallVector<std::pair<ReturnInst *, StoreInst *>, 4> ReplaceableStores; + + Type *ArgTy = OutArg->getType()->getPointerElementType(); + + // Skip this argument if converting it will push us over the limit on + // the number of registers we can return. + + // TODO: This is an approximation. When legalized this could be more. We + // can ask TLI for exactly how many. + unsigned ArgNumRegs = DL->getTypeStoreSize(ArgTy) / 4; + if (ArgNumRegs + ReturnNumRegs > MaxNumRetRegs) + continue; + + // An argument is convertible only if all exit blocks are able to replace + // it.
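// Aside (illustration of the retry loop described above, not from the patch):
// for a sincos-style helper,
//
//   void sincos_approx(float x, float *s, float *c) {
//     *s = my_sin(x);   // first pass: possibly clobbered by the store to *c
//     *c = my_cos(x);   // first pass: proven to be the defining store
//   }
//
// MemoryDependenceAnalysis cannot prove that 's' and 'c' do not alias, so on
// the first iteration only the store to *c is replaceable. Once that store has
// been moved into the struct-returning body, the second iteration can prove
// the store to *s as well. my_sin/my_cos are hypothetical stand-ins.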
+ for (ReturnInst *RI : Returns) { + BasicBlock *BB = RI->getParent(); + + MemDepResult Q = MDA->getPointerDependencyFrom(MemoryLocation(OutArg), + true, BB->end(), BB, RI); + StoreInst *SI = nullptr; + if (Q.isDef()) + SI = dyn_cast<StoreInst>(Q.getInst()); + + if (SI) { + DEBUG(dbgs() << "Found out argument store: " << *SI << '\n'); + ReplaceableStores.emplace_back(RI, SI); + } else { + ThisReplaceable = false; + break; + } + } + + if (!ThisReplaceable) + continue; // Try the next argument candidate. + + for (std::pair<ReturnInst *, StoreInst *> Store : ReplaceableStores) { + Value *ReplVal = Store.second->getValueOperand(); + + auto &ValVec = Replacements[Store.first]; + if (llvm::find_if(ValVec, + [OutArg](const std::pair<Argument *, Value *> &Entry) { + return Entry.first == OutArg;}) != ValVec.end()) { + DEBUG(dbgs() << "Saw multiple out arg stores" << *OutArg << '\n'); + // It is possible to see stores to the same argument multiple times, + // but we expect these would have been optimized out already. + ThisReplaceable = false; + break; + } + + ValVec.emplace_back(OutArg, ReplVal); + Store.second->eraseFromParent(); + } + + if (ThisReplaceable) { + ReturnTypes.push_back(ArgTy); + OutArgIndexes.insert(OutArg->getArgNo()); + ++NumOutArgumentsReplaced; + Changing = true; + } + } + } while (Changing); + + if (Replacements.empty()) + return false; + + LLVMContext &Ctx = F.getParent()->getContext(); + StructType *NewRetTy = StructType::create(Ctx, ReturnTypes, F.getName()); + + FunctionType *NewFuncTy = FunctionType::get(NewRetTy, + F.getFunctionType()->params(), + F.isVarArg()); + + DEBUG(dbgs() << "Computed new return type: " << *NewRetTy << '\n'); + + Function *NewFunc = Function::Create(NewFuncTy, Function::PrivateLinkage, + F.getName() + ".body"); + F.getParent()->getFunctionList().insert(F.getIterator(), NewFunc); + NewFunc->copyAttributesFrom(&F); + NewFunc->setComdat(F.getComdat()); + + // We want to preserve the function and param attributes, but need to strip + // off any return attributes, e.g. zeroext doesn't make sense with a struct. + NewFunc->stealArgumentListFrom(F); + + AttrBuilder RetAttrs; + RetAttrs.addAttribute(Attribute::SExt); + RetAttrs.addAttribute(Attribute::ZExt); + RetAttrs.addAttribute(Attribute::NoAlias); + NewFunc->removeAttributes(AttributeList::ReturnIndex, RetAttrs); + // TODO: How to preserve metadata? + + // Move the body of the function into the new rewritten function, and replace + // this function with a stub. 
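// Aside (a source-level sketch of the end result, reusing the foo() example
// from the file comment; this is an illustration, not code from the patch):
//
//   std::pair<int, int> foo_body(int a, int b /*, int *out kept but unused */);
//   int foo(int a, int b, int *out) {      // the stub left in place of foo()
//     std::pair<int, int> r = foo_body(a, b);
//     *out = r.second;                     // store rebuilt from the struct member
//     return r.first;                      // original return value is element 0
//   }
//
// The stub is marked always_inline further below, and DeadArgumentElimination
// later drops the now-unused pointer parameter from the ".body" clone.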
+ NewFunc->getBasicBlockList().splice(NewFunc->begin(), F.getBasicBlockList()); + + for (std::pair<ReturnInst *, ReplacementVec> &Replacement : Replacements) { + ReturnInst *RI = Replacement.first; + IRBuilder<> B(RI); + B.SetCurrentDebugLocation(RI->getDebugLoc()); + + int RetIdx = 0; + Value *NewRetVal = UndefValue::get(NewRetTy); + + Value *RetVal = RI->getReturnValue(); + if (RetVal) + NewRetVal = B.CreateInsertValue(NewRetVal, RetVal, RetIdx++); + + for (std::pair<Argument *, Value *> ReturnPoint : Replacement.second) { + Argument *Arg = ReturnPoint.first; + Value *Val = ReturnPoint.second; + Type *EltTy = Arg->getType()->getPointerElementType(); + if (Val->getType() != EltTy) { + Type *EffectiveEltTy = EltTy; + if (StructType *CT = dyn_cast<StructType>(EltTy)) { + assert(CT->getNumContainedTypes() == 1); + EffectiveEltTy = CT->getContainedType(0); + } + + if (DL->getTypeSizeInBits(EffectiveEltTy) != + DL->getTypeSizeInBits(Val->getType())) { + assert(isVec3ToVec4Shuffle(EffectiveEltTy, Val->getType())); + Val = B.CreateShuffleVector(Val, UndefValue::get(Val->getType()), + { 0, 1, 2 }); + } + + Val = B.CreateBitCast(Val, EffectiveEltTy); + + // Re-create single element composite. + if (EltTy != EffectiveEltTy) + Val = B.CreateInsertValue(UndefValue::get(EltTy), Val, 0); + } + + NewRetVal = B.CreateInsertValue(NewRetVal, Val, RetIdx++); + } + + if (RetVal) + RI->setOperand(0, NewRetVal); + else { + B.CreateRet(NewRetVal); + RI->eraseFromParent(); + } + } + + SmallVector<Value *, 16> StubCallArgs; + for (Argument &Arg : F.args()) { + if (OutArgIndexes.count(Arg.getArgNo())) { + // It's easier to preserve the type of the argument list. We rely on + // DeadArgumentElimination to take care of these. + StubCallArgs.push_back(UndefValue::get(Arg.getType())); + } else { + StubCallArgs.push_back(&Arg); + } + } + + BasicBlock *StubBB = BasicBlock::Create(Ctx, "", &F); + IRBuilder<> B(StubBB); + CallInst *StubCall = B.CreateCall(NewFunc, StubCallArgs); + + int RetIdx = RetTy->isVoidTy() ? 0 : 1; + for (Argument &Arg : F.args()) { + if (!OutArgIndexes.count(Arg.getArgNo())) + continue; + + PointerType *ArgType = cast<PointerType>(Arg.getType()); + + auto *EltTy = ArgType->getElementType(); + unsigned Align = Arg.getParamAlignment(); + if (Align == 0) + Align = DL->getABITypeAlignment(EltTy); + + Value *Val = B.CreateExtractValue(StubCall, RetIdx++); + Type *PtrTy = Val->getType()->getPointerTo(ArgType->getAddressSpace()); + + // We can peek through bitcasts, so the type may not match. + Value *PtrVal = B.CreateBitCast(&Arg, PtrTy); + + B.CreateAlignedStore(Val, PtrVal, Align); + } + + if (!RetTy->isVoidTy()) { + B.CreateRet(B.CreateExtractValue(StubCall, 0)); + } else { + B.CreateRetVoid(); + } + + // The function is now a stub we want to inline. 
+ F.addFnAttr(Attribute::AlwaysInline); + + ++NumOutArgumentFunctionsReplaced; + return true; +} + +FunctionPass *llvm::createAMDGPURewriteOutArgumentsPass() { + return new AMDGPURewriteOutArguments(); +} diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 779617629010..80feaa44766f 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -15,17 +15,15 @@ #include "AMDGPUSubtarget.h" #include "AMDGPU.h" #include "AMDGPUTargetMachine.h" -#ifdef LLVM_BUILD_GLOBAL_ISEL #include "AMDGPUCallLowering.h" #include "AMDGPUInstructionSelector.h" #include "AMDGPULegalizerInfo.h" #include "AMDGPURegisterBankInfo.h" -#endif #include "SIMachineFunctionInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/MDBuilder.h" -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include <algorithm> using namespace llvm; @@ -50,14 +48,27 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, // for SI has the unhelpful behavior that it unsets everything else if you // disable it. - SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,"); + SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,"); + if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. - FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; + FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,"; + + // FIXME: I don't think think Evergreen has any useful support for + // denormals, but should be checked. Should we issue a warning somewhere + // if someone tries to enable these? + if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + FullFS += "+fp64-fp16-denormals,"; + } else { + FullFS += "-fp32-denormals,"; + } FullFS += FS; ParseSubtargetFeatures(GPU, FullFS); + // We don't support FP64 for EG/NI atm. + assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)); + // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es // on VI and newer hardware to avoid assertion failures due to missing ADDR64 // variants of MUBUF instructions. @@ -65,45 +76,24 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, FlatForGlobal = true; } - // FIXME: I don't think think Evergreen has any useful support for - // denormals, but should be checked. Should we issue a warning somewhere - // if someone tries to enable these? - if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - FP64FP16Denormals = false; - FP32Denormals = false; - } - // Set defaults if needed. if (MaxPrivateElementSize == 0) MaxPrivateElementSize = 4; - return *this; -} + if (LDSBankCount == 0) + LDSBankCount = 32; -#ifdef LLVM_BUILD_GLOBAL_ISEL -namespace { + if (TT.getArch() == Triple::amdgcn) { + if (LocalMemorySize == 0) + LocalMemorySize = 32768; -struct SIGISelActualAccessor : public GISelAccessor { - std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; - std::unique_ptr<InstructionSelector> InstSelector; - std::unique_ptr<LegalizerInfo> Legalizer; - std::unique_ptr<RegisterBankInfo> RegBankInfo; - const AMDGPUCallLowering *getCallLowering() const override { - return CallLoweringInfo.get(); + // Do something sensible for unspecified target. 
+ if (!HasMovrel && !HasVGPRIndexMode) + HasMovrel = true; } - const InstructionSelector *getInstructionSelector() const override { - return InstSelector.get(); - } - const LegalizerInfo *getLegalizerInfo() const override { - return Legalizer.get(); - } - const RegisterBankInfo *getRegBankInfo() const override { - return RegBankInfo.get(); - } -}; -} // end anonymous namespace -#endif + return *this; +} AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const TargetMachine &TM) @@ -111,7 +101,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, TargetTriple(TT), Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600), IsaVersion(ISAVersion0_0_0), - WavefrontSize(64), + WavefrontSize(0), LocalMemorySize(0), LDSBankCount(0), MaxPrivateElementSize(0), @@ -125,6 +115,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, DX10Clamp(false), FlatForGlobal(false), AutoWaitcntBeforeBarrier(false), + CodeObjectV3(false), UnalignedScratchAccess(false), UnalignedBufferAccess(false), @@ -135,6 +126,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, DebuggerReserveRegs(false), DebuggerEmitPrologue(false), + EnableHugePrivateBuffer(false), EnableVGPRSpilling(false), EnablePromoteAlloca(false), EnableLoadStoreOpt(false), @@ -143,15 +135,17 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, DumpCode(false), FP64(false), + FMA(false), IsGCN(false), - GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), GFX9Insts(false), SGPRInitBug(false), HasSMemRealTime(false), Has16BitInsts(false), + HasIntClamp(false), HasVOP3PInsts(false), + HasMadMixInsts(false), HasMovrel(false), HasVGPRIndexMode(false), HasScalarStores(false), @@ -167,6 +161,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FlatInstOffsets(false), FlatGlobalInsts(false), FlatScratchInsts(false), + AddNoCarryInsts(false), R600ALUInst(false), CaymanISA(false), @@ -203,14 +198,31 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, return NumWaves; } +std::pair<unsigned, unsigned> +AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const { + switch (CC) { + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4); + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_LS: + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + return std::make_pair(1, getWavefrontSize()); + default: + return std::make_pair(1, 16 * getWavefrontSize()); + } +} + std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes( const Function &F) const { + // FIXME: 1024 if function. // Default minimum/maximum flat work group sizes. std::pair<unsigned, unsigned> Default = - AMDGPU::isCompute(F.getCallingConv()) ? - std::pair<unsigned, unsigned>(getWavefrontSize() * 2, - getWavefrontSize() * 4) : - std::pair<unsigned, unsigned>(1, getWavefrontSize()); + getDefaultFlatWorkGroupSize(F.getCallingConv()); // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa // starts using "amdgpu-flat-work-group-size" attribute. 
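As a note on the new getDefaultFlatWorkGroupSize() hunk above: with a wavefront size of 64 (the usual GCN value, stated here as an assumption since the subtarget may differ), compute kernels now default to a flat work group size of 128..256, graphics shaders to 1..64, and other callable functions to 1..1024. A minimal standalone sketch of the same mapping, using hypothetical boolean flags in place of the calling-convention switch:

#include <utility>

// Hypothetical restatement of the defaults added above, assuming WavefrontSize == 64.
std::pair<unsigned, unsigned> defaultFlatWorkGroupSize(bool IsComputeKernel,
                                                       bool IsGraphicsShader) {
  const unsigned WavefrontSize = 64;
  if (IsComputeKernel)      // AMDGPU_CS, AMDGPU_KERNEL, SPIR_KERNEL
    return {WavefrontSize * 2, WavefrontSize * 4};   // 128..256
  if (IsGraphicsShader)     // AMDGPU_VS/LS/HS/ES/GS/PS
    return {1, WavefrontSize};                       // 1..64
  return {1, 16 * WavefrontSize};                    // 1..1024 for other functions
}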
@@ -357,18 +369,12 @@ SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), TLInfo(TM, *this) { -#ifndef LLVM_BUILD_GLOBAL_ISEL - GISelAccessor *GISel = new GISelAccessor(); -#else - SIGISelActualAccessor *GISel = new SIGISelActualAccessor(); - GISel->CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); - GISel->Legalizer.reset(new AMDGPULegalizerInfo()); - - GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); - GISel->InstSelector.reset(new AMDGPUInstructionSelector( - *this, *static_cast<AMDGPURegisterBankInfo *>(GISel->RegBankInfo.get()))); -#endif - setGISelAccessor(*GISel); + CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); + Legalizer.reset(new AMDGPULegalizerInfo()); + + RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); + InstSelector.reset(new AMDGPUInstructionSelector( + *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()))); } void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, @@ -462,7 +468,7 @@ unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { } unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { - const Function &F = *MF.getFunction(); + const Function &F = MF.getFunction(); const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); // Compute maximum number of SGPRs function can use using default/requested @@ -512,7 +518,7 @@ unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { } unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { - const Function &F = *MF.getFunction(); + const Function &F = MF.getFunction(); const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); // Compute maximum number of VGPRs function can use using default/requested @@ -544,3 +550,59 @@ unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { return MaxNumVGPRs - getReservedNumVGPRs(MF); } + +namespace { +struct MemOpClusterMutation : ScheduleDAGMutation { + const SIInstrInfo *TII; + + MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {} + + void apply(ScheduleDAGInstrs *DAGInstrs) override { + ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs); + + SUnit *SUa = nullptr; + // Search for two consequent memory operations and link them + // to prevent scheduler from moving them apart. + // In DAG pre-process SUnits are in the original order of + // the instructions before scheduling. 
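// Aside (simplified restatement of the walk below; sameMemClass() is a
// hypothetical stand-in for the TII->isVMEM/isFLAT/isSMRD/isDS checks):
//
//   SUnit *Prev = nullptr;
//   for (SUnit &SU : DAG->SUnits) {
//     MachineInstr &MI = *SU.getInstr();
//     if (!MI.mayLoad() && !MI.mayStore()) { Prev = nullptr; continue; }
//     if (Prev && sameMemClass(*Prev->getInstr(), MI))
//       SU.addPredBarrier(Prev);     // keep the pair adjacent in the schedule
//     Prev = &SU;
//   }
//
// The real mutation additionally adds artificial edges between the pair and
// its neighbouring units so nothing can be scheduled in between.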
+ for (SUnit &SU : DAG->SUnits) { + MachineInstr &MI2 = *SU.getInstr(); + if (!MI2.mayLoad() && !MI2.mayStore()) { + SUa = nullptr; + continue; + } + if (!SUa) { + SUa = &SU; + continue; + } + + MachineInstr &MI1 = *SUa->getInstr(); + if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) || + (TII->isFLAT(MI1) && TII->isFLAT(MI2)) || + (TII->isSMRD(MI1) && TII->isSMRD(MI2)) || + (TII->isDS(MI1) && TII->isDS(MI2))) { + SU.addPredBarrier(SUa); + + for (const SDep &SI : SU.Preds) { + if (SI.getSUnit() != SUa) + SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial)); + } + + if (&SU != &DAG->ExitSU) { + for (const SDep &SI : SUa->Succs) { + if (SI.getSUnit() != &SU) + SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial)); + } + } + } + + SUa = &SU; + } + } +}; +} // namespace + +void SISubtarget::getPostRAMutations( + std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { + Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo)); +} diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index d4b6a5fe8020..cf4a691d4b58 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -16,6 +16,7 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H #include "AMDGPU.h" +#include "AMDGPUCallLowering.h" #include "R600FrameLowering.h" #include "R600ISelLowering.h" #include "R600InstrInfo.h" @@ -25,7 +26,9 @@ #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Triple.h" -#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/MC/MCInstrItineraries.h" @@ -63,16 +66,14 @@ public: ISAVersion7_0_1, ISAVersion7_0_2, ISAVersion7_0_3, + ISAVersion7_0_4, ISAVersion8_0_0, ISAVersion8_0_1, ISAVersion8_0_2, ISAVersion8_0_3, - ISAVersion8_0_4, ISAVersion8_1_0, ISAVersion9_0_0, - ISAVersion9_0_1, - ISAVersion9_0_2, - ISAVersion9_0_3 + ISAVersion9_0_2 }; enum TrapHandlerAbi { @@ -116,6 +117,7 @@ protected: bool DX10Clamp; bool FlatForGlobal; bool AutoWaitcntBeforeBarrier; + bool CodeObjectV3; bool UnalignedScratchAccess; bool UnalignedBufferAccess; bool HasApertureRegs; @@ -126,6 +128,7 @@ protected: bool DebuggerEmitPrologue; // Used as options. 
+ bool EnableHugePrivateBuffer; bool EnableVGPRSpilling; bool EnablePromoteAlloca; bool EnableLoadStoreOpt; @@ -135,15 +138,17 @@ protected: // Subtarget statically properties set by tablegen bool FP64; + bool FMA; bool IsGCN; - bool GCN1Encoding; bool GCN3Encoding; bool CIInsts; bool GFX9Insts; bool SGPRInitBug; bool HasSMemRealTime; bool Has16BitInsts; + bool HasIntClamp; bool HasVOP3PInsts; + bool HasMadMixInsts; bool HasMovrel; bool HasVGPRIndexMode; bool HasScalarStores; @@ -159,6 +164,7 @@ protected: bool FlatInstOffsets; bool FlatGlobalInsts; bool FlatScratchInsts; + bool AddNoCarryInsts; bool R600ALUInst; bool CaymanISA; bool CFALUBug; @@ -210,6 +216,10 @@ public: TargetTriple.getEnvironmentName() == "amdgizcl"; } + bool isAmdPalOS() const { + return TargetTriple.getOS() == Triple::AMDPAL; + } + Generation getGeneration() const { return Gen; } @@ -218,6 +228,10 @@ public: return WavefrontSize; } + unsigned getWavefrontSizeLog2() const { + return Log2_32(WavefrontSize); + } + int getLocalMemorySize() const { return LocalMemorySize; } @@ -238,11 +252,15 @@ public: return Has16BitInsts; } + bool hasIntClamp() const { + return HasIntClamp; + } + bool hasVOP3PInsts() const { return HasVOP3PInsts; } - bool hasHWFP64() const { + bool hasFP64() const { return FP64; } @@ -305,6 +323,18 @@ public: return getGeneration() >= GFX9; } + bool hasMadMixInsts() const { + return HasMadMixInsts; + } + + bool hasSBufferLoadStoreAtomicDwordxN() const { + // Only use the "x1" variants on GFX9 or don't use the buffer variants. + // For x2 and higher variants, if the accessed region spans 2 VM pages and + // the second page is unmapped, the hw hangs. + // TODO: There is one future GFX9 chip that doesn't have this bug. + return getGeneration() != GFX9; + } + bool hasCARRY() const { return (getGeneration() >= EVERGREEN); } @@ -317,10 +347,18 @@ public: return CaymanISA; } + bool hasFMA() const { + return FMA; + } + TrapHandlerAbi getTrapHandlerAbi() const { return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; } + bool enableHugePrivateBuffer() const { + return EnableHugePrivateBuffer; + } + bool isPromoteAllocaEnabled() const { return EnablePromoteAlloca; } @@ -344,7 +382,7 @@ public: unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const { const auto *MFI = MF.getInfo<SIMachineFunctionInfo>(); - return getOccupancyWithLocalMemSize(MFI->getLDSSize(), *MF.getFunction()); + return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction()); } bool hasFP16Denormals() const { @@ -372,17 +410,27 @@ public: } bool enableIEEEBit(const MachineFunction &MF) const { - return AMDGPU::isCompute(MF.getFunction()->getCallingConv()); + return AMDGPU::isCompute(MF.getFunction().getCallingConv()); } bool useFlatForGlobal() const { return FlatForGlobal; } + /// \returns If MUBUF instructions always perform range checking, even for + /// buffer resources used for private memory access. + bool privateMemoryResourceIsRangeChecked() const { + return getGeneration() < AMDGPUSubtarget::GFX9; + } + bool hasAutoWaitcntBeforeBarrier() const { return AutoWaitcntBeforeBarrier; } + bool hasCodeObjectV3() const { + return CodeObjectV3; + } + bool hasUnalignedBufferAccess() const { return UnalignedBufferAccess; } @@ -419,19 +467,37 @@ public: return FlatScratchInsts; } + bool hasD16LoadStore() const { + return getGeneration() >= GFX9; + } + + /// Return if most LDS instructions have an m0 use that require m0 to be + /// iniitalized. 
+ bool ldsRequiresM0Init() const { + return getGeneration() < GFX9; + } + + bool hasAddNoCarry() const { + return AddNoCarryInsts; + } + bool isMesaKernel(const MachineFunction &MF) const { - return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv()); + return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction().getCallingConv()); } // Covers VS/PS/CS graphics shaders bool isMesaGfxShader(const MachineFunction &MF) const { - return isMesa3DOS() && AMDGPU::isShader(MF.getFunction()->getCallingConv()); + return isMesa3DOS() && AMDGPU::isShader(MF.getFunction().getCallingConv()); } bool isAmdCodeObjectV2(const MachineFunction &MF) const { return isAmdHsaOS() || isMesaKernel(MF); } + bool hasMad64_32() const { + return getGeneration() >= SEA_ISLANDS; + } + bool hasFminFmaxLegacy() const { return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; } @@ -558,6 +624,9 @@ public: FlatWorkGroupSize); } + /// \returns Default range flat work group size for a calling convention. + std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; + /// \returns Subtarget's default pair of minimum/maximum flat work group sizes /// for function \p F, or minimum/maximum flat work group sizes explicitly /// requested using "amdgpu-flat-work-group-size" attribute attached to @@ -626,7 +695,12 @@ private: SIInstrInfo InstrInfo; SIFrameLowering FrameLowering; SITargetLowering TLInfo; - std::unique_ptr<GISelAccessor> GISel; + + /// GlobalISel related APIs. + std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; + std::unique_ptr<InstructionSelector> InstSelector; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; public: SISubtarget(const Triple &TT, StringRef CPU, StringRef FS, @@ -645,33 +719,25 @@ public: } const CallLowering *getCallLowering() const override { - assert(GISel && "Access to GlobalISel APIs not set"); - return GISel->getCallLowering(); + return CallLoweringInfo.get(); } const InstructionSelector *getInstructionSelector() const override { - assert(GISel && "Access to GlobalISel APIs not set"); - return GISel->getInstructionSelector(); + return InstSelector.get(); } const LegalizerInfo *getLegalizerInfo() const override { - assert(GISel && "Access to GlobalISel APIs not set"); - return GISel->getLegalizerInfo(); + return Legalizer.get(); } const RegisterBankInfo *getRegBankInfo() const override { - assert(GISel && "Access to GlobalISel APIs not set"); - return GISel->getRegBankInfo(); + return RegBankInfo.get(); } const SIRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); } - void setGISelAccessor(GISelAccessor &GISel) { - this->GISel.reset(&GISel); - } - // XXX - Why is this here if it isn't in the default pass set? 
bool enableEarlyIfConversion() const override { return true; @@ -755,11 +821,16 @@ public: return getGeneration() >= AMDGPUSubtarget::GFX9; } - bool hasReadM0Hazard() const { + bool hasReadM0MovRelInterpHazard() const { return getGeneration() >= AMDGPUSubtarget::GFX9; } - unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const; + bool hasReadM0SendMsgHazard() const { + return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; + } + + unsigned getKernArgSegmentSize(const MachineFunction &MF, + unsigned ExplictArgBytes) const; /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; @@ -865,6 +936,10 @@ public: /// subtarget's specifications, or does not meet number of waves per execution /// unit requirement. unsigned getMaxNumVGPRs(const MachineFunction &MF) const; + + void getPostRAMutations( + std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) + const override; }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index dc868f010d85..6984f4e71613 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -31,6 +31,7 @@ #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" @@ -39,7 +40,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" @@ -117,10 +117,23 @@ static cl::opt<bool> EnableSIInsertWaitcntsPass( cl::init(true)); // Option to run late CFG structurizer -static cl::opt<bool> LateCFGStructurize( +static cl::opt<bool, true> LateCFGStructurize( "amdgpu-late-structurize", cl::desc("Enable late CFG structurization"), - cl::init(false), + cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), + cl::Hidden); + +static cl::opt<bool> EnableAMDGPUFunctionCalls( + "amdgpu-function-calls", + cl::Hidden, + cl::desc("Enable AMDGPU function call support"), + cl::init(false)); + +// Enable lib calls simplifications +static cl::opt<bool> EnableLibCallSimplify( + "amdgpu-simplify-libcall", + cl::desc("Enable mdgpu library simplifications"), + cl::init(true), cl::Hidden); extern "C" void LLVMInitializeAMDGPUTarget() { @@ -129,20 +142,29 @@ extern "C" void LLVMInitializeAMDGPUTarget() { RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget()); PassRegistry *PR = PassRegistry::getPassRegistry(); + initializeR600ClauseMergePassPass(*PR); + initializeR600ControlFlowFinalizerPass(*PR); + initializeR600PacketizerPass(*PR); + initializeR600ExpandSpecialInstrsPassPass(*PR); + initializeR600VectorRegMergerPass(*PR); + initializeAMDGPUDAGToDAGISelPass(*PR); initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); initializeSIFoldOperandsPass(*PR); initializeSIPeepholeSDWAPass(*PR); initializeSIShrinkInstructionsPass(*PR); - initializeSIFixControlFlowLiveIntervalsPass(*PR); + initializeSIOptimizeExecMaskingPreRAPass(*PR); initializeSILoadStoreOptimizerPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); 
initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); + initializeAMDGPUArgumentUsageInfoPass(*PR); initializeAMDGPULowerIntrinsicsPass(*PR); + initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); + initializeAMDGPURewriteOutArgumentsPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowPass(*PR); initializeSIInsertWaitsPass(*PR); @@ -150,10 +172,15 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); initializeSIInsertSkipsPass(*PR); + initializeSIMemoryLegalizerPass(*PR); initializeSIDebuggerInsertNopsPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); + initializeSIFixWWMLivenessPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); + initializeAMDGPUUseNativeCallsPass(*PR); + initializeAMDGPUSimplifyLibCallsPass(*PR); + initializeAMDGPUInlinerPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -192,6 +219,16 @@ static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) { GCNIterativeScheduler::SCHEDULE_MINREGFORCED); } +static ScheduleDAGInstrs * +createIterativeILPMachineScheduler(MachineSchedContext *C) { + auto DAG = new GCNIterativeScheduler(C, + GCNIterativeScheduler::SCHEDULE_ILP); + DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); + return DAG; +} + static MachineSchedRegistry R600SchedRegistry("r600", "Run R600's custom scheduler", createR600MachineScheduler); @@ -215,9 +252,18 @@ GCNMinRegSchedRegistry("gcn-minreg", "Run GCN iterative scheduler for minimal register usage (experimental)", createMinRegScheduler); +static MachineSchedRegistry +GCNILPSchedRegistry("gcn-ilp", + "Run GCN iterative scheduler for ILP scheduling (experimental)", + createIterativeILPMachineScheduler); + static StringRef computeDataLayout(const Triple &TT) { if (TT.getArch() == Triple::r600) { // 32-bit pointers. + if (TT.getEnvironmentName() == "amdgiz" || + TT.getEnvironmentName() == "amdgizcl") + return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; } @@ -239,9 +285,8 @@ static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { if (!GPU.empty()) return GPU; - // HSA only supports CI+, so change the default GPU to a CI for HSA. if (TT.getArch() == Triple::amdgcn) - return (TT.getOS() == Triple::AMDHSA) ? 
"kaveri" : "tahiti"; + return "generic"; return "r600"; } @@ -252,21 +297,30 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { return Reloc::PIC_; } +static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) { + if (CM) + return *CM; + return CodeModel::Small; +} + AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, Optional<Reloc::Model> RM, - CodeModel::Model CM, + Optional<CodeModel::Model> CM, CodeGenOpt::Level OptLevel) - : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), - FS, Options, getEffectiveRelocModel(RM), CM, OptLevel), - TLOF(createTLOF(getTargetTriple())) { + : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), + FS, Options, getEffectiveRelocModel(RM), + getEffectiveCodeModel(CM), OptLevel), + TLOF(createTLOF(getTargetTriple())) { AS = AMDGPU::getAMDGPUAS(TT); initAsmInfo(); } AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; +bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; + StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { Attribute GPUAttr = F.getFnAttribute("target-cpu"); return GPUAttr.hasAttribute(Attribute::None) ? @@ -288,15 +342,38 @@ static ImmutablePass *createAMDGPUExternalAAWrapperPass() { }); } +/// Predicate for Internalize pass. +static bool mustPreserveGV(const GlobalValue &GV) { + if (const Function *F = dyn_cast<Function>(&GV)) + return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv()); + + return !GV.use_empty(); +} + void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { Builder.DivergentTarget = true; - bool Internalize = InternalizeSymbols && - (getOptLevel() > CodeGenOpt::None) && - (getTargetTriple().getArch() == Triple::amdgcn); - bool EarlyInline = EarlyInlineAll && - (getOptLevel() > CodeGenOpt::None); - bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None; + bool EnableOpt = getOptLevel() > CodeGenOpt::None; + bool Internalize = InternalizeSymbols; + bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls; + bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt; + bool LibCallSimplify = EnableLibCallSimplify && EnableOpt; + + if (EnableAMDGPUFunctionCalls) { + delete Builder.Inliner; + Builder.Inliner = createAMDGPUFunctionInliningPass(); + } + + if (Internalize) { + // If we're generating code, we always have the whole program available. The + // relocations expected for externally visible functions aren't supported, + // so make sure every non-entry function is hidden. 
+ Builder.addExtension( + PassManagerBuilder::EP_EnabledOnOptLevel0, + [](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + PM.add(createInternalizePass(mustPreserveGV)); + }); + } Builder.addExtension( PassManagerBuilder::EP_ModuleOptimizerEarly, @@ -308,38 +385,25 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { } PM.add(createAMDGPUUnifyMetadataPass()); if (Internalize) { - PM.add(createInternalizePass([=](const GlobalValue &GV) -> bool { - if (const Function *F = dyn_cast<Function>(&GV)) { - if (F->isDeclaration()) - return true; - switch (F->getCallingConv()) { - default: - return false; - case CallingConv::AMDGPU_VS: - case CallingConv::AMDGPU_HS: - case CallingConv::AMDGPU_GS: - case CallingConv::AMDGPU_PS: - case CallingConv::AMDGPU_CS: - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - return true; - } - } - return !GV.use_empty(); - })); + PM.add(createInternalizePass(mustPreserveGV)); PM.add(createGlobalDCEPass()); } if (EarlyInline) PM.add(createAMDGPUAlwaysInlinePass(false)); }); + const auto &Opt = Options; Builder.addExtension( PassManagerBuilder::EP_EarlyAsPossible, - [AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) { + [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &, + legacy::PassManagerBase &PM) { if (AMDGPUAA) { PM.add(createAMDGPUAAWrapperPass()); PM.add(createAMDGPUExternalAAWrapperPass()); } + PM.add(llvm::createAMDGPUUseNativeCallsPass()); + if (LibCallSimplify) + PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt)); }); Builder.addExtension( @@ -359,8 +423,9 @@ R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, Optional<Reloc::Model> RM, - CodeModel::Model CM, CodeGenOpt::Level OL) - : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) { + Optional<CodeModel::Model> CM, + CodeGenOpt::Level OL, bool JIT) + : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) { setRequiresStructuredCFG(true); } @@ -392,8 +457,9 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, Optional<Reloc::Model> RM, - CodeModel::Model CM, CodeGenOpt::Level OL) - : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} + Optional<CodeModel::Model> CM, + CodeGenOpt::Level OL, bool JIT) + : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { StringRef GPU = getGPUName(F); @@ -464,6 +530,7 @@ public: } bool addPreISel() override; + bool addInstSelector() override; void addPreRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; @@ -472,7 +539,12 @@ public: class GCNPassConfig final : public AMDGPUPassConfig { public: GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM) - : AMDGPUPassConfig(TM, PM) {} + : AMDGPUPassConfig(TM, PM) { + // It is necessary to know the register usage of the entire call graph. We + // allow calls without EnableAMDGPUFunctionCalls if they are marked + // noinline, so this is always required. 
+ setRequiresCodeGenSCCOrder(true); + } GCNTargetMachine &getGCNTargetMachine() const { return getTM<GCNTargetMachine>(); @@ -485,12 +557,10 @@ public: void addMachineSSAOptimization() override; bool addILPOpts() override; bool addInstSelector() override; -#ifdef LLVM_BUILD_GLOBAL_ISEL bool addIRTranslator() override; bool addLegalizeMachineIR() override; bool addRegBankSelect() override; bool addGlobalInstructionSelect() override; -#endif void addFastRegAlloc(FunctionPass *RegAllocPass) override; void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; void addPreRegAlloc() override; @@ -540,15 +610,18 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createAMDGPULowerIntrinsicsPass()); - // Function calls are not supported, so make sure we inline everything. - addPass(createAMDGPUAlwaysInlinePass()); - addPass(createAlwaysInlinerLegacyPass()); - // We need to add the barrier noop pass, otherwise adding the function - // inlining pass will cause all of the PassConfigs passes to be run - // one function at a time, which means if we have a nodule with two - // functions, then we will generate code for the first function - // without ever running any passes on the second. - addPass(createBarrierNoopPass()); + if (TM.getTargetTriple().getArch() == Triple::r600 || + !EnableAMDGPUFunctionCalls) { + // Function calls are not supported, so make sure we inline everything. + addPass(createAMDGPUAlwaysInlinePass()); + addPass(createAlwaysInlinerLegacyPass()); + // We need to add the barrier noop pass, otherwise adding the function + // inlining pass will cause all of the PassConfigs passes to be run + // one function at a time, which means if we have a nodule with two + // functions, then we will generate code for the first function + // without ever running any passes on the second. + addPass(createBarrierNoopPass()); + } if (TM.getTargetTriple().getArch() == Triple::amdgcn) { // TODO: May want to move later or split into an early and late one. @@ -559,6 +632,9 @@ void AMDGPUPassConfig::addIRPasses() { // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. addPass(createAMDGPUOpenCLImageTypeLoweringPass()); + // Replace OpenCL enqueued block function pointers with global variables. + addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); + if (TM.getOptLevel() > CodeGenOpt::None) { addPass(createInferAddressSpacesPass()); addPass(createAMDGPUPromoteAlloca()); @@ -609,7 +685,7 @@ bool AMDGPUPassConfig::addPreISel() { } bool AMDGPUPassConfig::addInstSelector() { - addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel())); + addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel())); return false; } @@ -630,6 +706,11 @@ bool R600PassConfig::addPreISel() { return false; } +bool R600PassConfig::addInstSelector() { + addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel())); + return false; +} + void R600PassConfig::addPreRegAlloc() { addPass(createR600VectorRegMerger()); } @@ -725,7 +806,6 @@ bool GCNPassConfig::addInstSelector() { return false; } -#ifdef LLVM_BUILD_GLOBAL_ISEL bool GCNPassConfig::addIRTranslator() { addPass(new IRTranslator()); return false; @@ -746,8 +826,6 @@ bool GCNPassConfig::addGlobalInstructionSelect() { return false; } -#endif - void GCNPassConfig::addPreRegAlloc() { if (LateCFGStructurize) { addPass(createAMDGPUMachineCFGStructurizerPass()); @@ -764,19 +842,25 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { // SI_ELSE will introduce a copy of the tied operand source after the else. 
insertPass(&PHIEliminationID, &SILowerControlFlowID, false); + // This must be run after SILowerControlFlow, since it needs to use the + // machine-level CFG, but before register allocation. + insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false); + TargetPassConfig::addFastRegAlloc(RegAllocPass); } void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { - // This needs to be run directly before register allocation because earlier - // passes might recompute live intervals. - insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); + insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID); // This must be run immediately after phi elimination and before // TwoAddressInstructions, otherwise the processing of the tied operand of // SI_ELSE will introduce a copy of the tied operand source after the else. insertPass(&PHIEliminationID, &SILowerControlFlowID, false); + // This must be run after SILowerControlFlow, since it needs to use the + // machine-level CFG, but before register allocation. + insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false); + TargetPassConfig::addOptimizedRegAlloc(RegAllocPass); } @@ -806,6 +890,7 @@ void GCNPassConfig::addPreEmitPass() { addPass(createSIInsertWaitsPass()); addPass(createSIShrinkInstructionsPass()); addPass(&SIInsertSkipsPassID); + addPass(createSIMemoryLegalizerPass()); addPass(createSIDebuggerInsertNopsPass()); addPass(&BranchRelaxationPassID); } diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h index a3c7c1982d0a..5043e31f6f5b 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -41,9 +41,11 @@ protected: StringRef getFeatureString(const Function &F) const; public: + static bool EnableLateStructurizeCFG; + AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, + Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM, CodeGenOpt::Level OL); ~AMDGPUTargetMachine() override; @@ -82,8 +84,8 @@ private: public: R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL); + Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM, + CodeGenOpt::Level OL, bool JIT); TargetPassConfig *createPassConfig(PassManagerBase &PM) override; @@ -105,12 +107,16 @@ private: public: GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL); + Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM, + CodeGenOpt::Level OL, bool JIT); TargetPassConfig *createPassConfig(PassManagerBase &PM) override; const SISubtarget *getSubtargetImpl(const Function &) const override; + + bool useIPRA() const override { + return true; + } }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp index 6c1885e67fcb..e2f718bd3c34 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -23,8 +23,7 @@ using namespace llvm; MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { - auto AS = static_cast<const AMDGPUTargetMachine*>(&TM)->getAMDGPUAS(); - if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO, AS) && 
+ if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO) && AMDGPU::shouldEmitConstantsToTextSection(TM.getTargetTriple())) return TextSection; diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 89a03902dc69..77c2d4b956c6 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===// +//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===// // // The LLVM Compiler Infrastructure // @@ -16,15 +16,40 @@ //===----------------------------------------------------------------------===// #include "AMDGPUTargetTransformInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/BasicTTIImpl.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/SubtargetFeature.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/CostTable.h" -#include "llvm/Target/TargetLowering.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include <algorithm> +#include <cassert> +#include <limits> +#include <utility> + using namespace llvm; #define DEBUG_TYPE "AMDGPUtti" @@ -54,7 +79,7 @@ static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, if (!L->contains(I)) continue; if (const PHINode *PHI = dyn_cast<PHINode>(V)) { - if (none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) { + if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) { return SubLoop->contains(PHI); })) return true; } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1)) @@ -66,7 +91,7 @@ static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { UP.Threshold = 300; // Twice the default. - UP.MaxCount = UINT_MAX; + UP.MaxCount = std::numeric_limits<unsigned>::max(); UP.Partial = true; // TODO: Do we want runtime unrolling? @@ -81,12 +106,11 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, const DataLayout &DL = BB->getModule()->getDataLayout(); unsigned LocalGEPsSeen = 0; - if (any_of(L->getSubLoops(), [BB](const Loop* SubLoop) { + if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) { return SubLoop->contains(BB); })) continue; // Block belongs to an inner loop. for (const Instruction &I : *BB) { - // Unroll a loop which contains an "if" statement whose condition // defined by a PHI belonging to the loop. 
This may help to eliminate // if region and potentially even PHI itself, saving on both divergence @@ -153,7 +177,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, if (!Inst || L->isLoopInvariant(Op)) continue; - if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) { + if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) { return SubLoop->contains(Inst); })) continue; HasLoopDef = true; @@ -264,11 +288,36 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { return 8; } +bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, + MemIntrinsicInfo &Info) const { + switch (Inst->getIntrinsicID()) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: { + auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2)); + auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4)); + if (!Ordering || !Volatile) + return false; // Invalid. + + unsigned OrderingVal = Ordering->getZExtValue(); + if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent)) + return false; + + Info.PtrVal = Inst->getArgOperand(0); + Info.Ordering = static_cast<AtomicOrdering>(OrderingVal); + Info.ReadMem = true; + Info.WriteMem = true; + Info.IsVolatile = !Volatile->isNullValue(); + return true; + } + default: + return false; + } +} + int AMDGPUTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) { - EVT OrigTy = TLI->getValueType(DL, Ty); if (!OrigTy.isSimple()) { return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, @@ -289,25 +338,23 @@ int AMDGPUTTIImpl::getArithmeticInstrCost( switch (ISD) { case ISD::SHL: case ISD::SRL: - case ISD::SRA: { + case ISD::SRA: if (SLT == MVT::i64) return get64BitInstrCost() * LT.first * NElts; // i32 return getFullRateInstrCost() * LT.first * NElts; - } case ISD::ADD: case ISD::SUB: case ISD::AND: case ISD::OR: - case ISD::XOR: { + case ISD::XOR: if (SLT == MVT::i64){ // and, or and xor are typically split into 2 VALU instructions. return 2 * getFullRateInstrCost() * LT.first * NElts; } return LT.first * NElts * getFullRateInstrCost(); - } case ISD::MUL: { const int QuarterRateCost = getQuarterRateInstrCost(); if (SLT == MVT::i64) { @@ -327,14 +374,12 @@ int AMDGPUTTIImpl::getArithmeticInstrCost( if (SLT == MVT::f32 || SLT == MVT::f16) return LT.first * NElts * getFullRateInstrCost(); break; - case ISD::FDIV: case ISD::FREM: // FIXME: frem should be handled separately. The fdiv in it is most of it, // but the current lowering is also not entirely correct. if (SLT == MVT::f64) { int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost(); - // Add cost of workaround. if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) Cost += 3 * getFullRateInstrCost(); @@ -342,13 +387,34 @@ int AMDGPUTTIImpl::getArithmeticInstrCost( return LT.first * Cost * NElts; } - // Assuming no fp32 denormals lowering. + if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) { + // TODO: This is more complicated, unsafe flags etc. 
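// Aside (reading of the fdiv costs in this hunk): the m_FPOne() case models
// "1.0 / x", which can lower to a single reciprocal instruction, so it is
// charged one quarter-rate instruction per element instead of the full
// expansion. Otherwise an f16 fdiv with 16-bit instructions costs
// 4 * FullRate + 2 * QuarterRate per element (the convert/rcp/fmul/div_fixup
// sequence listed below), and an f32 fdiv costs 7 * FullRate + 1 * QuarterRate
// per element, plus 2 * FullRate for the FP mode switches when fp32 denormals
// are disabled.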
+ if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) || + (SLT == MVT::f16 && ST->has16BitInsts())) { + return LT.first * getQuarterRateInstrCost() * NElts; + } + } + + if (SLT == MVT::f16 && ST->has16BitInsts()) { + // 2 x v_cvt_f32_f16 + // f32 rcp + // f32 fmul + // v_cvt_f16_f32 + // f16 div_fixup + int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(); + return LT.first * Cost * NElts; + } + if (SLT == MVT::f32 || SLT == MVT::f16) { - assert(!ST->hasFP32Denormals() && "will change when supported"); int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost(); + + if (!ST->hasFP32Denormals()) { + // FP mode switches. + Cost += 2 * getFullRateInstrCost(); + } + return LT.first * NElts * Cost; } - break; default: break; @@ -451,7 +517,9 @@ static bool isArgPassedInSGPR(const Argument *A) { case CallingConv::SPIR_KERNEL: return true; case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_LS: case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: @@ -465,11 +533,9 @@ static bool isArgPassedInSGPR(const Argument *A) { } } -/// /// \returns true if the result of the value could potentially be /// different across workitems in a wavefront. bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { - if (const Argument *A = dyn_cast<Argument>(V)) return !isArgPassedInSGPR(A); @@ -534,3 +600,16 @@ unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Inde return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } + +bool AMDGPUTTIImpl::areInlineCompatible(const Function *Caller, + const Function *Callee) const { + const TargetMachine &TM = getTLI()->getTargetMachine(); + const FeatureBitset &CallerBits = + TM.getSubtargetImpl(*Caller)->getFeatureBits(); + const FeatureBitset &CalleeBits = + TM.getSubtargetImpl(*Callee)->getFeatureBits(); + + FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; + FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; + return ((RealCallerBits & RealCalleeBits) == RealCalleeBits); +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 9a320bdfcc3d..8899d2c6da8a 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -1,4 +1,4 @@ -//===-- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI -------*- C++ -*-===// +//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -6,38 +6,76 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +// /// \file /// This file a TargetTransformInfo::Concept conforming object specific to the /// AMDGPU target machine. It uses the target's detailed information to /// provide more precise answers to certain TTI queries, while letting the /// target independent and default TTI implementations handle the rest. 
-/// +// //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/IR/Function.h" +#include "llvm/MC/SubtargetFeature.h" +#include "llvm/Support/MathExtras.h" +#include <cassert> namespace llvm { + class AMDGPUTargetLowering; +class Loop; +class ScalarEvolution; +class Type; +class Value; class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { - typedef BasicTTIImplBase<AMDGPUTTIImpl> BaseT; - typedef TargetTransformInfo TTI; + using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>; + using TTI = TargetTransformInfo; + friend BaseT; const AMDGPUSubtarget *ST; const AMDGPUTargetLowering *TLI; bool IsGraphicsShader; + const FeatureBitset InlineFeatureIgnoreList = { + // Codegen control options which don't matter. + AMDGPU::FeatureEnableLoadStoreOpt, + AMDGPU::FeatureEnableSIScheduler, + AMDGPU::FeatureEnableUnsafeDSOffsetFolding, + AMDGPU::FeatureFlatForGlobal, + AMDGPU::FeaturePromoteAlloca, + AMDGPU::FeatureUnalignedBufferAccess, + AMDGPU::FeatureUnalignedScratchAccess, + + AMDGPU::FeatureAutoWaitcntBeforeBarrier, + AMDGPU::FeatureDebuggerEmitPrologue, + AMDGPU::FeatureDebuggerInsertNops, + AMDGPU::FeatureDebuggerReserveRegs, + + // Property of the kernel/environment which can't actually differ. + AMDGPU::FeatureSGPRInitBug, + AMDGPU::FeatureXNACK, + AMDGPU::FeatureTrapHandler, + + // Perf-tuning features + AMDGPU::FeatureFastFMAF32, + AMDGPU::HalfRate64Ops + }; + const AMDGPUSubtarget *getST() const { return ST; } const AMDGPUTargetLowering *getTLI() const { return TLI; } - static inline int getFullRateInstrCost() { return TargetTransformInfo::TCC_Basic; } @@ -78,7 +116,7 @@ public: unsigned getHardwareNumberOfRegisters(bool Vector) const; unsigned getNumberOfRegisters(bool Vector) const; - unsigned getRegisterBitWidth(bool Vector) const ; + unsigned getRegisterBitWidth(bool Vector) const; unsigned getMinVectorRegisterBitWidth() const; unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; @@ -94,6 +132,8 @@ public: unsigned getMaxInterleaveFactor(unsigned VF); + bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const; + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, @@ -121,8 +161,13 @@ public: unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + + bool areInlineCompatible(const Function *Caller, + const Function *Callee) const; + + unsigned getInliningThresholdMultiplier() { return 9; } }; } // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 309913f87fb6..6107f3a7dd18 100644 --- a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -21,18 +21,26 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" 
#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Type.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" + using namespace llvm; #define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes" @@ -42,6 +50,7 @@ namespace { class AMDGPUUnifyDivergentExitNodes : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid + AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) { initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry()); } @@ -51,9 +60,12 @@ public: bool runOnFunction(Function &F) override; }; -} +} // end anonymous namespace char AMDGPUUnifyDivergentExitNodes::ID = 0; + +char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID; + INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, "Unify divergent function exit nodes", false, false) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) @@ -61,8 +73,6 @@ INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE, "Unify divergent function exit nodes", false, false) -char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID; - void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{ // TODO: Preserve dominator tree. AU.addRequired<PostDominatorTreeWrapperPass>(); @@ -113,7 +123,6 @@ static BasicBlock *unifyReturnBlockSet(Function &F, // Otherwise, we need to insert a new basic block into the function, add a PHI // nodes (if the function returns values), and convert all of the return // instructions into unconditional branches. - // BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F); PHINode *PN = nullptr; @@ -129,7 +138,6 @@ static BasicBlock *unifyReturnBlockSet(Function &F, // Loop over all of the blocks, replacing the return instruction with an // unconditional branch. - // for (BasicBlock *BB : ReturningBlocks) { // Add an incoming element to the PHI node for every return instruction that // is merging into this new block... @@ -142,7 +150,7 @@ static BasicBlock *unifyReturnBlockSet(Function &F, for (BasicBlock *BB : ReturningBlocks) { // Cleanup possible branch to unconditional branch to the return. - SimplifyCFG(BB, TTI, 2); + simplifyCFG(BB, TTI, {2}); } return NewRetBlock; @@ -157,7 +165,6 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) { // Loop over all of the blocks in a function, tracking all of the blocks that // return. 
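For readers following the AMDGPUUnifyDivergentExitNodes hunks, the return-unification step described by the comments above works roughly as follows. This is a condensed, hypothetical paraphrase of the unifyReturnBlockSet flow, not the pass itself; the real code also collects unreachable blocks and finishes by running simplifyCFG(BB, TTI, {2}) on each old returning block, as the hunk shows.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static BasicBlock *buildUnifiedReturnBlock(Function &F,
                                           ArrayRef<BasicBlock *> ReturningBlocks) {
  // New block that will hold the single return of the function.
  BasicBlock *NewRetBlock =
      BasicBlock::Create(F.getContext(), "UnifiedReturnBlock", &F);

  // If the function returns a value, merge the per-block values in a PHI.
  PHINode *PN = nullptr;
  if (!F.getReturnType()->isVoidTy())
    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
                         "UnifiedRetVal", NewRetBlock);
  ReturnInst::Create(F.getContext(), PN, NewRetBlock);

  // Replace each 'ret' with an unconditional branch to the unified block.
  for (BasicBlock *BB : ReturningBlocks) {
    ReturnInst *RI = cast<ReturnInst>(BB->getTerminator());
    if (PN)
      PN->addIncoming(RI->getReturnValue(), BB);
    RI->eraseFromParent();
    BranchInst::Create(NewRetBlock, BB);
  }
  return NewRetBlock;
}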
- // SmallVector<BasicBlock *, 4> ReturningBlocks; SmallVector<BasicBlock *, 4> UnreachableBlocks; diff --git a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp index 3a0c3ede08f4..b78568e89cfb 100644 --- a/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp +++ b/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata -------------------===// +//===- AMDGPUUnifyMetadata.cpp - Unify OpenCL metadata --------------------===// // // The LLVM Compiler Infrastructure // @@ -16,7 +16,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/Function.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include <algorithm> @@ -41,10 +41,11 @@ namespace { class AMDGPUUnifyMetadata : public ModulePass { public: static char ID; - explicit AMDGPUUnifyMetadata() : ModulePass(ID) {}; + + explicit AMDGPUUnifyMetadata() : ModulePass(ID) {} private: - virtual bool runOnModule(Module &M); + bool runOnModule(Module &M) override; /// \brief Unify version metadata. /// \return true if changes are made. diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 1a393845a822..0a0e43123ae0 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -1,11 +1,10 @@ -//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===// +//===- AMDILCFGStructurizer.cpp - CFG Structurizer ------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -/// \file //==-----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -67,7 +66,7 @@ STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); namespace llvm { - void initializeAMDGPUCFGStructurizerPass(PassRegistry&); +void initializeAMDGPUCFGStructurizerPass(PassRegistry &); } // end namespace llvm @@ -121,9 +120,9 @@ public: class AMDGPUCFGStructurizer : public MachineFunctionPass { public: - typedef SmallVector<MachineBasicBlock *, 32> MBBVector; - typedef std::map<MachineBasicBlock *, BlockInformation *> MBBInfoMap; - typedef std::map<MachineLoop *, MachineBasicBlock *> LoopLandInfoMap; + using MBBVector = SmallVector<MachineBasicBlock *, 32>; + using MBBInfoMap = std::map<MachineBasicBlock *, BlockInformation *>; + using LoopLandInfoMap = std::map<MachineLoop *, MachineBasicBlock *>; enum PathToKind { Not_SinglePath = 0, @@ -234,6 +233,7 @@ protected: void insertCondBranchBefore(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, int NewOpcode, int RegNum, const DebugLoc &DL); + static int getBranchNzeroOpcode(int OldOpcode); static int getBranchZeroOpcode(int OldOpcode); static int getContinueNzeroOpcode(int OldOpcode); @@ -246,21 +246,25 @@ protected: static bool isUncondBranch(MachineInstr *MI); static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB); static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB); + /// The correct naming for this is getPossibleLoopendBlockBranchInstr. /// /// BB with backward-edge could have move instructions after the branch /// instruction. Such move instruction "belong to" the loop backward-edge. 
MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB); + static MachineInstr *getReturnInstr(MachineBasicBlock *MBB); static bool isReturnBlock(MachineBasicBlock *MBB); static void cloneSuccessorList(MachineBasicBlock *DstMBB, - MachineBasicBlock *SrcMBB) ; + MachineBasicBlock *SrcMBB); static MachineBasicBlock *clone(MachineBasicBlock *MBB); + /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose /// because the AMDGPU instruction is not recognized as terminator fix this /// and retire this routine void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk); + static void wrapup(MachineBasicBlock *MBB); int patternMatch(MachineBasicBlock *MBB); @@ -299,6 +303,7 @@ protected: MachineBasicBlock *LandMBB); void settleLoopcontBlock(MachineBasicBlock *ContingMBB, MachineBasicBlock *ContMBB); + /// normalizeInfiniteLoopExit change /// B1: /// uncond_br LoopHeader @@ -309,6 +314,7 @@ protected: /// and return the newly added dummy exit block MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep); void removeUnconditionalBranch(MachineBasicBlock *MBB); + /// Remove duplicate branches instructions in a block. /// For instance /// B0: @@ -318,6 +324,7 @@ protected: /// B0: /// cond_br X B1 B2 void removeRedundantConditionalBranch(MachineBasicBlock *MBB); + void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB); void removeSuccessor(MachineBasicBlock *MBB); MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB, @@ -335,10 +342,10 @@ private: SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks; }; -char AMDGPUCFGStructurizer::ID = 0; - } // end anonymous namespace +char AMDGPUCFGStructurizer::ID = 0; + int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const { MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); if (It == BlockInfoMap.end()) @@ -535,7 +542,7 @@ int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { case AMDGPU::JUMP_COND: case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; default: llvm_unreachable("internal error"); - }; + } return -1; } @@ -1168,6 +1175,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB, return Num; } +#ifndef NDEBUG void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) { @@ -1209,6 +1217,7 @@ void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf( dbgs() << "\n"; } +#endif int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, @@ -1595,7 +1604,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB); if (!BranchMI) { DEBUG( - dbgs() << "migrateInstruction don't see branch instr\n" ; + dbgs() << "migrateInstruction don't see branch instr\n"; ); SpliceEnd = SrcMBB->end(); } else { @@ -1632,7 +1641,7 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) { FuncRep->push_back(DummyExitBlk); //insert to function SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";); - LLVMContext &Ctx = LoopHeader->getParent()->getFunction()->getContext(); + LLVMContext &Ctx = LoopHeader->getParent()->getFunction().getContext(); Ctx.emitError("Extra register needed to handle CFG"); return nullptr; } diff --git 
a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index b37c274102bc..2acd7f78faea 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ---------===// +//===- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===// // // The LLVM Compiler Infrastructure // @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "AMDGPU.h" #include "AMDKernelCodeT.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" @@ -40,7 +41,9 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SMLoc.h" @@ -83,7 +86,7 @@ public: AMDGPUOperand(KindTy Kind_, const AMDGPUAsmParser *AsmParser_) : MCParsedAsmOperand(), Kind(Kind_), AsmParser(AsmParser_) {} - typedef std::unique_ptr<AMDGPUOperand> Ptr; + using Ptr = std::unique_ptr<AMDGPUOperand>; struct Modifiers { bool Abs = false; @@ -129,6 +132,7 @@ public: ImmTyIdxen, ImmTyAddr64, ImmTyOffset, + ImmTyInstOffset, ImmTyOffset0, ImmTyOffset1, ImmTyGLC, @@ -164,7 +168,8 @@ public: ImmTyOpSelHi, ImmTyNegLo, ImmTyNegHi, - ImmTySwizzle + ImmTySwizzle, + ImmTyHigh }; struct TokOp { @@ -290,8 +295,8 @@ public: bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<16>(getImm()); } bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); } - bool isOffsetU12() const { return isImmTy(ImmTyOffset) && isUInt<12>(getImm()); } - bool isOffsetS13() const { return isImmTy(ImmTyOffset) && isInt<13>(getImm()); } + bool isOffsetU12() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isUInt<12>(getImm()); } + bool isOffsetS13() const { return (isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset)) && isInt<13>(getImm()); } bool isGDS() const { return isImmTy(ImmTyGDS); } bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } @@ -312,6 +317,7 @@ public: bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); } bool isNegLo() const { return isImmTy(ImmTyNegLo); } bool isNegHi() const { return isImmTy(ImmTyNegHi); } + bool isHigh() const { return isImmTy(ImmTyHigh); } bool isMod() const { return isClampSI() || isOModSI(); @@ -637,6 +643,7 @@ public: case ImmTyIdxen: OS << "Idxen"; break; case ImmTyAddr64: OS << "Addr64"; break; case ImmTyOffset: OS << "Offset"; break; + case ImmTyInstOffset: OS << "InstOffset"; break; case ImmTyOffset0: OS << "Offset0"; break; case ImmTyOffset1: OS << "Offset1"; break; case ImmTyGLC: OS << "GLC"; break; @@ -673,6 +680,7 @@ public: case ImmTyNegLo: OS << "NegLo"; break; case ImmTyNegHi: OS << "NegHi"; break; case ImmTySwizzle: OS << "Swizzle"; break; + case ImmTyHigh: OS << "High"; break; } } @@ -801,7 +809,6 @@ public: }; class AMDGPUAsmParser : public MCTargetAsmParser { - const MCInstrInfo &MII; MCAsmParser &Parser; unsigned ForcedEncodingSize = 0; @@ -822,11 +829,15 @@ private: bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor); bool ParseDirectiveHSACodeObjectVersion(); bool ParseDirectiveHSACodeObjectISA(); - bool ParseDirectiveCodeObjectMetadata(); bool ParseAMDKernelCodeTValue(StringRef ID, 
amd_kernel_code_t &Header); bool ParseDirectiveAMDKernelCodeT(); bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const; bool ParseDirectiveAMDGPUHsaKernel(); + + bool ParseDirectiveISAVersion(); + bool ParseDirectiveHSAMetadata(); + bool ParseDirectivePALMetadata(); + bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum); @@ -843,12 +854,12 @@ public: Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY }; - typedef std::map<AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap; + using OptionalImmIndexMap = std::map<AMDGPUOperand::ImmTy, unsigned>; AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser) { + : MCTargetAsmParser(Options, STI, MII), Parser(_Parser) { MCAsmParserExtension::Initialize(Parser); if (getFeatureBits().none()) { @@ -905,6 +916,10 @@ public: return !isVI(); } + bool hasIntClamp() const { + return getFeatureBits()[AMDGPU::FeatureIntClamp]; + } + AMDGPUTargetStreamer &getTargetStreamer() { MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); return static_cast<AMDGPUTargetStreamer &>(TS); @@ -991,8 +1006,9 @@ public: private: struct OperandInfoTy { int64_t Id; - bool IsSymbolic; - OperandInfoTy(int64_t Id_) : Id(Id_), IsSymbolic(false) { } + bool IsSymbolic = false; + + OperandInfoTy(int64_t Id_) : Id(Id_) {} }; bool parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId); @@ -1004,6 +1020,7 @@ private: bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc); bool validateConstantBusLimitations(const MCInst &Inst); bool validateEarlyClobberLimitations(const MCInst &Inst); + bool validateIntClampSupported(const MCInst &Inst); bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; @@ -1060,9 +1077,12 @@ public: void cvtVOP3(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx); + void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); + void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands); + void cvtMIMG(MCInst &Inst, const OperandVector &Operands, bool IsAtomic = false); void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands); @@ -1279,7 +1299,6 @@ uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const } void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const { - if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()), Inst.getNumOperands())) { addLiteralImmOperand(Inst, Imm.Val, @@ -1311,7 +1330,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: - case AMDGPU::OPERAND_REG_INLINE_C_FP64: { + case AMDGPU::OPERAND_REG_INLINE_C_FP64: if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Literal.getZExtValue())); @@ -1335,7 +1354,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // unclear how we should encode them. 
This case should be checked earlier // in predicate methods (isLiteralImm()) llvm_unreachable("fp literal in 64-bit integer instruction."); - } + case AMDGPU::OPERAND_REG_IMM_INT32: case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: @@ -1377,7 +1396,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo case AMDGPU::OPERAND_REG_IMM_INT32: case AMDGPU::OPERAND_REG_IMM_FP32: case AMDGPU::OPERAND_REG_INLINE_C_INT32: - case AMDGPU::OPERAND_REG_INLINE_C_FP32: { + case AMDGPU::OPERAND_REG_INLINE_C_FP32: if (isInt<32>(Val) && AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val), AsmParser->hasInv2PiInlineImm())) { @@ -1387,11 +1406,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo Inst.addOperand(MCOperand::createImm(Val & 0xffffffff)); return; - } + case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: - case AMDGPU::OPERAND_REG_INLINE_C_FP64: { + case AMDGPU::OPERAND_REG_INLINE_C_FP64: if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Val)); return; @@ -1399,11 +1418,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo Inst.addOperand(MCOperand::createImm(Lo_32(Val))); return; - } + case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: - case AMDGPU::OPERAND_REG_INLINE_C_FP16: { + case AMDGPU::OPERAND_REG_INLINE_C_FP16: if (isInt<16>(Val) && AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val), AsmParser->hasInv2PiInlineImm())) { @@ -1413,7 +1432,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo Inst.addOperand(MCOperand::createImm(Val & 0xffff)); return; - } + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: { auto LiteralVal = static_cast<uint16_t>(Literal.getLoBits(16).getZExtValue()); @@ -1711,7 +1730,6 @@ AMDGPUAsmParser::parseAbsoluteExpr(int64_t &Val, bool AbsMod) { if (AbsMod && getLexer().peekTok().is(AsmToken::Pipe) && (getLexer().getKind() == AsmToken::Integer || getLexer().getKind() == AsmToken::Real)) { - // This is a workaround for handling operands like these: // |1.0| // |-1| @@ -2111,7 +2129,6 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | SIInstrFlags::VOP3 | SIInstrFlags::VOP3P | SIInstrFlags::SDWA)) { - // Check special imm operands (used by madmk, etc) if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) { ++ConstantBusUseCount; @@ -2156,7 +2173,6 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) { } bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) { - const unsigned Opcode = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opcode); @@ -2193,6 +2209,20 @@ bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) { return true; } +bool AMDGPUAsmParser::validateIntClampSupported(const MCInst &Inst) { + + const unsigned Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + if ((Desc.TSFlags & SIInstrFlags::IntClamp) != 0 && !hasIntClamp()) { + int ClampIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp); + assert(ClampIdx != -1); + return Inst.getOperand(ClampIdx).getImm() == 0; + } + + return true; +} + bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, const SMLoc &IDLoc) { if 
(!validateConstantBusLimitations(Inst)) { @@ -2205,6 +2235,11 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, "destination must be different than all sources"); return false; } + if (!validateIntClampSupported(Inst)) { + Error(IDLoc, + "integer clamping is not supported on this GPU"); + return false; + } return true; } @@ -2365,49 +2400,6 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { return false; } -bool AMDGPUAsmParser::ParseDirectiveCodeObjectMetadata() { - std::string YamlString; - raw_string_ostream YamlStream(YamlString); - - getLexer().setSkipSpace(false); - - bool FoundEnd = false; - while (!getLexer().is(AsmToken::Eof)) { - while (getLexer().is(AsmToken::Space)) { - YamlStream << getLexer().getTok().getString(); - Lex(); - } - - if (getLexer().is(AsmToken::Identifier)) { - StringRef ID = getLexer().getTok().getIdentifier(); - if (ID == AMDGPU::CodeObject::MetadataAssemblerDirectiveEnd) { - Lex(); - FoundEnd = true; - break; - } - } - - YamlStream << Parser.parseStringToEndOfStatement() - << getContext().getAsmInfo()->getSeparatorString(); - - Parser.eatToEndOfStatement(); - } - - getLexer().setSkipSpace(true); - - if (getLexer().is(AsmToken::Eof) && !FoundEnd) { - return TokError( - "expected directive .end_amdgpu_code_object_metadata not found"); - } - - YamlStream.flush(); - - if (!getTargetStreamer().EmitCodeObjectMetadata(YamlString)) - return Error(getParser().getTok().getLoc(), "invalid code object metadata"); - - return false; -} - bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header) { SmallString<40> ErrStr; @@ -2460,6 +2452,103 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() { return false; } +bool AMDGPUAsmParser::ParseDirectiveISAVersion() { + if (getSTI().getTargetTriple().getArch() != Triple::amdgcn) { + return Error(getParser().getTok().getLoc(), + ".amd_amdgpu_isa directive is not available on non-amdgcn " + "architectures"); + } + + auto ISAVersionStringFromASM = getLexer().getTok().getStringContents(); + + std::string ISAVersionStringFromSTI; + raw_string_ostream ISAVersionStreamFromSTI(ISAVersionStringFromSTI); + IsaInfo::streamIsaVersion(&getSTI(), ISAVersionStreamFromSTI); + + if (ISAVersionStringFromASM != ISAVersionStreamFromSTI.str()) { + return Error(getParser().getTok().getLoc(), + ".amd_amdgpu_isa directive does not match triple and/or mcpu " + "arguments specified through the command line"); + } + + getTargetStreamer().EmitISAVersion(ISAVersionStreamFromSTI.str()); + Lex(); + + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() { + if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA) { + return Error(getParser().getTok().getLoc(), + (Twine(HSAMD::AssemblerDirectiveBegin) + Twine(" directive is " + "not available on non-amdhsa OSes")).str()); + } + + std::string HSAMetadataString; + raw_string_ostream YamlStream(HSAMetadataString); + + getLexer().setSkipSpace(false); + + bool FoundEnd = false; + while (!getLexer().is(AsmToken::Eof)) { + while (getLexer().is(AsmToken::Space)) { + YamlStream << getLexer().getTok().getString(); + Lex(); + } + + if (getLexer().is(AsmToken::Identifier)) { + StringRef ID = getLexer().getTok().getIdentifier(); + if (ID == AMDGPU::HSAMD::AssemblerDirectiveEnd) { + Lex(); + FoundEnd = true; + break; + } + } + + YamlStream << Parser.parseStringToEndOfStatement() + << getContext().getAsmInfo()->getSeparatorString(); + + Parser.eatToEndOfStatement(); + } + + getLexer().setSkipSpace(true); + + if (getLexer().is(AsmToken::Eof) && 
!FoundEnd) { + return TokError(Twine("expected directive ") + + Twine(HSAMD::AssemblerDirectiveEnd) + Twine(" not found")); + } + + YamlStream.flush(); + + if (!getTargetStreamer().EmitHSAMetadata(HSAMetadataString)) + return Error(getParser().getTok().getLoc(), "invalid HSA metadata"); + + return false; +} + +bool AMDGPUAsmParser::ParseDirectivePALMetadata() { + if (getSTI().getTargetTriple().getOS() != Triple::AMDPAL) { + return Error(getParser().getTok().getLoc(), + (Twine(PALMD::AssemblerDirective) + Twine(" directive is " + "not available on non-amdpal OSes")).str()); + } + + PALMD::Metadata PALMetadata; + for (;;) { + uint32_t Value; + if (ParseAsAbsoluteExpression(Value)) { + return TokError(Twine("invalid value in ") + + Twine(PALMD::AssemblerDirective)); + } + PALMetadata.push_back(Value); + if (getLexer().isNot(AsmToken::Comma)) + break; + Lex(); + } + getTargetStreamer().EmitPALMetadata(PALMetadata); + return false; +} + bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); @@ -2469,20 +2558,45 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".hsa_code_object_isa") return ParseDirectiveHSACodeObjectISA(); - if (IDVal == AMDGPU::CodeObject::MetadataAssemblerDirectiveBegin) - return ParseDirectiveCodeObjectMetadata(); - if (IDVal == ".amd_kernel_code_t") return ParseDirectiveAMDKernelCodeT(); if (IDVal == ".amdgpu_hsa_kernel") return ParseDirectiveAMDGPUHsaKernel(); + if (IDVal == ".amd_amdgpu_isa") + return ParseDirectiveISAVersion(); + + if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin) + return ParseDirectiveHSAMetadata(); + + if (IDVal == PALMD::AssemblerDirective) + return ParseDirectivePALMetadata(); + return true; } bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const { + + for (MCRegAliasIterator R(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, &MRI, true); + R.isValid(); ++R) { + if (*R == RegNo) + return isGFX9(); + } + + switch (RegNo) { + case AMDGPU::TBA: + case AMDGPU::TBA_LO: + case AMDGPU::TBA_HI: + case AMDGPU::TMA: + case AMDGPU::TMA_LO: + case AMDGPU::TMA_HI: + return !isGFX9(); + default: + break; + } + if (isCI()) return true; @@ -2529,24 +2643,22 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { if (ResTy == MatchOperand_Success) return ResTy; - if (getLexer().getKind() == AsmToken::Identifier) { - // If this identifier is a symbol, we want to create an expression for it. - // It is a little difficult to distinguish between a symbol name, and - // an instruction flag like 'gds'. In order to do this, we parse - // all tokens as expressions and then treate the symbol name as the token - // string when we want to interpret the operand as a token. - const auto &Tok = Parser.getTok(); - SMLoc S = Tok.getLoc(); - const MCExpr *Expr = nullptr; - if (!Parser.parseExpression(Expr)) { - Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S)); - return MatchOperand_Success; - } + const auto &Tok = Parser.getTok(); + SMLoc S = Tok.getLoc(); + + const MCExpr *Expr = nullptr; + if (!Parser.parseExpression(Expr)) { + Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S)); + return MatchOperand_Success; + } - Operands.push_back(AMDGPUOperand::CreateToken(this, Tok.getString(), Tok.getLoc())); + // Possibly this is an instruction flag like 'gds'. 
+ if (Tok.getKind() == AsmToken::Identifier) { + Operands.push_back(AMDGPUOperand::CreateToken(this, Tok.getString(), S)); Parser.Lex(); return MatchOperand_Success; } + return MatchOperand_NoMatch; } @@ -2688,7 +2800,7 @@ OperandMatchResultTy AMDGPUAsmParser::parseOperandArrayWithPrefix( // FIXME: How to verify the number of elements matches the number of src // operands? - for (int I = 0; I < 3; ++I) { + for (int I = 0; I < 4; ++I) { if (I != 0) { if (getLexer().is(AsmToken::RBrac)) break; @@ -4016,11 +4128,13 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr}, {"gds", AMDGPUOperand::ImmTyGDS, true, nullptr}, {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, + {"inst_offset", AMDGPUOperand::ImmTyInstOffset, false, nullptr}, {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr}, {"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, + {"high", AMDGPUOperand::ImmTyHigh, true, nullptr}, {"clamp", AMDGPUOperand::ImmTyClampSI, true, nullptr}, {"omod", AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul}, {"unorm", AMDGPUOperand::ImmTyUNorm, true, nullptr}, @@ -4088,6 +4202,30 @@ OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) return MatchOperand_NoMatch; } +void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands) { + cvtVOP3P(Inst, Operands); + + int Opc = Inst.getOpcode(); + + int SrcNum; + const int Ops[] = { AMDGPU::OpName::src0, + AMDGPU::OpName::src1, + AMDGPU::OpName::src2 }; + for (SrcNum = 0; + SrcNum < 3 && AMDGPU::getNamedOperandIdx(Opc, Ops[SrcNum]) != -1; + ++SrcNum); + assert(SrcNum > 0); + + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); + + if ((OpSel & (1 << SrcNum)) != 0) { + int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers); + uint32_t ModVal = Inst.getOperand(ModIdx).getImm(); + Inst.getOperand(ModIdx).setImm(ModVal | SISrcMods::DST_OP_SEL); + } +} + static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) { // 1. 
This operand is input modifiers return Desc.OpInfo[OpNum].OperandType == AMDGPU::OPERAND_INPUT_MODS @@ -4099,6 +4237,45 @@ static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) { && Desc.getOperandConstraint(OpNum + 1, MCOI::OperandConstraint::TIED_TO) == -1; } +void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands) +{ + OptionalImmIndexMap OptionalIdx; + unsigned Opc = Inst.getOpcode(); + + unsigned I = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); + } + + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + } else if (Op.isInterpSlot() || + Op.isInterpAttr() || + Op.isAttrChan()) { + Inst.addOperand(MCOperand::createImm(Op.Imm.Val)); + } else if (Op.isImmModifier()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + llvm_unreachable("unhandled operand type"); + } + } + + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::high) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyHigh); + } + + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); + } + + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); + } +} + void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx) { unsigned Opc = Inst.getOpcode(); @@ -4162,20 +4339,36 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { cvtVOP3(Inst, Operands, OptionalIdx); } -void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { +void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, + const OperandVector &Operands) { OptionalImmIndexMap OptIdx; + const int Opc = Inst.getOpcode(); + const MCInstrDesc &Desc = MII.get(Opc); + + const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0; cvtVOP3(Inst, Operands, OptIdx); + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1) { + assert(!IsPacked); + Inst.addOperand(Inst.getOperand(0)); + } + // FIXME: This is messy. Parse the modifiers as if it was a normal VOP3 // instruction, and then figure out where to actually put the modifiers - int Opc = Inst.getOpcode(); addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel); - addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSelHi, -1); + + int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi); + if (OpSelHiIdx != -1) { + int DefaultVal = IsPacked ? 
-1 : 0; + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSelHi, + DefaultVal); + } int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo); if (NegLoIdx != -1) { + assert(IsPacked); addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo); addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi); } @@ -4188,13 +4381,16 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { AMDGPU::OpName::src2_modifiers }; int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); - int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi); unsigned OpSel = Inst.getOperand(OpSelIdx).getImm(); - unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm(); + unsigned OpSelHi = 0; unsigned NegLo = 0; unsigned NegHi = 0; + if (OpSelHiIdx != -1) { + OpSelHi = Inst.getOperand(OpSelHiIdx).getImm(); + } + if (NegLoIdx != -1) { int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); NegLo = Inst.getOperand(NegLoIdx).getImm(); @@ -4323,7 +4519,6 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { if (getLexer().isNot(AsmToken::RBrac)) return MatchOperand_ParseFail; Parser.Lex(); - } else { // sel:%d Parser.Lex(); @@ -4383,6 +4578,11 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } + // All DPP instructions with at least one source operand have a fake "old" + // source at the beginning that's tied to the dst operand. Handle it here. + if (Desc.getNumOperands() >= 2) + Inst.addOperand(Inst.getOperand(0)); + for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments @@ -4405,16 +4605,6 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl); - - // special case v_mac_{f16, f32}: - // it has src2 register operand that is tied to dst operand - if (Inst.getOpcode() == AMDGPU::V_MAC_F32_dpp || - Inst.getOpcode() == AMDGPU::V_MAC_F16_dpp) { - auto it = Inst.begin(); - std::advance( - it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); - Inst.insert(it, Inst.getOperand(0)); // src2 = dst - } } //===----------------------------------------------------------------------===// @@ -4503,6 +4693,7 @@ void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) { void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, uint64_t BasicInstType, bool skipVcc) { using namespace llvm::AMDGPU::SDWA; + OptionalImmIndexMap OptionalIdx; bool skippedVcc = false; diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index 2e96c14eaa32..2230457b3a9b 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -11,8 +11,8 @@ def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">; def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; -def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantRoot]>; -def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantRoot], 20>; +def MUBUFScratchOffen : 
ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>; +def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>; def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">; @@ -425,45 +425,51 @@ class MUBUF_SetupAddr<int addrKind> { class MUBUF_Load_Pseudo <string opName, int addrKind, RegisterClass vdataClass, + bit HasTiedDest = 0, list<dag> pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind> : MUBUF_Pseudo<opName, (outs vdataClass:$vdata), - getMUBUFIns<addrKindCopy>.ret, + !con(getMUBUFIns<addrKindCopy>.ret, !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))), " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe", pattern>, MUBUF_SetupAddr<addrKindCopy> { let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; + let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", ""); let mayLoad = 1; let mayStore = 0; + let maybeAtomic = 1; } // FIXME: tfe can't be an operand because it requires a separate // opcode because it needs an N+1 register class dest register. multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass, ValueType load_vt = i32, - SDPatternOperator ld = null_frag> { + SDPatternOperator ld = null_frag, + bit TiedDest = 0> { def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + TiedDest, [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>, MUBUFAddr64Table<0>; def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + TiedDest, [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>, MUBUFAddr64Table<1>; - def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>; + def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>; + def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>; let DisableWQM = 1 in { - def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>; - def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest>; + def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest>; + def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest>; + def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest>; } } @@ -483,6 +489,7 @@ class MUBUF_Store_Pseudo <string opName, let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret; let mayLoad = 0; let mayStore = 1; + let maybeAtomic = 1; } multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass, @@ -566,6 +573,7 @@ class MUBUF_Atomic_Pseudo<string opName, let DisableWQM = 1; let has_glc = 0; let has_tfe = 0; + let maybeAtomic = 1; } class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind, @@ -617,21 +625,21 @@ multiclass MUBUF_Pseudo_Atomics <string opName, def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, 
BUFAddrKind.IdxEn, vdataClass>; def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; - def _RTN_OFFSET : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, + def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, [(set vdataType:$vdata, (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc), vdataType:$vdata_in))]>, MUBUFAddr64Table <0, "_RTN">; - def _RTN_ADDR64 : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, + def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, [(set vdataType:$vdata, (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vdataType:$vdata_in))]>, MUBUFAddr64Table <1, "_RTN">; - def _RTN_OFFEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; - def _RTN_IDXEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; - def _RTN_BOTHEN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; + def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>; + def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>; + def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>; } @@ -639,8 +647,6 @@ multiclass MUBUF_Pseudo_Atomics <string opName, // MUBUF Instructions //===----------------------------------------------------------------------===// -let SubtargetPredicate = isGCN in { - defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads < "buffer_load_format_x", VGPR_32 >; @@ -696,16 +702,16 @@ defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores < "buffer_store_short", VGPR_32, i32, truncstorei16_global >; defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores < - "buffer_store_dword", VGPR_32, i32, global_store + "buffer_store_dword", VGPR_32, i32, store_global >; defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores < - "buffer_store_dwordx2", VReg_64, v2i32, global_store + "buffer_store_dwordx2", VReg_64, v2i32, store_global >; defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores < - "buffer_store_dwordx3", VReg_96, untyped, global_store + "buffer_store_dwordx3", VReg_96, untyped, store_global >; defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores < - "buffer_store_dwordx4", VReg_128, v4i32, global_store + "buffer_store_dwordx4", VReg_128, v4i32, store_global >; defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics < "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global @@ -802,6 +808,42 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; } +let SubtargetPredicate = HasD16LoadStore in { + +defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads < + "buffer_load_ubyte_d16", VGPR_32, i32, null_frag, 1 +>; + +defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads < + "buffer_load_ubyte_d16_hi", VGPR_32, i32, null_frag, 1 +>; + +defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads < + "buffer_load_sbyte_d16", VGPR_32, i32, null_frag, 1 +>; + +defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads < + "buffer_load_sbyte_d16_hi", VGPR_32, i32, null_frag, 1 +>; + +defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads < + "buffer_load_short_d16", VGPR_32, i32, null_frag, 1 +>; + +defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads < + "buffer_load_short_d16_hi", VGPR_32, i32, null_frag, 1 +>; + +defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores < + "buffer_store_byte_d16_hi", VGPR_32, i32 +>; + +defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores < + "buffer_store_short_d16_hi", VGPR_32, i32 +>; + +} // End HasD16LoadStore + def 
BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; @@ -818,8 +860,6 @@ defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>; defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; -} // End let SubtargetPredicate = isGCN - let SubtargetPredicate = isCIVI in { //===----------------------------------------------------------------------===// @@ -838,22 +878,13 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol", // MUBUF Patterns //===----------------------------------------------------------------------===// -let Predicates = [isGCN] in { - -// Offset in an 32-bit VGPR -def : Pat < - (SIload_constant v4i32:$sbase, i32:$voff), - (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0) ->; - - //===----------------------------------------------------------------------===// // buffer_load/store_format patterns //===----------------------------------------------------------------------===// multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { - def : Pat< + def : GCNPat< (vt (name v4i32:$rsrc, 0, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$glc, imm:$slc)), @@ -861,7 +892,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, (as_i1imm $glc), (as_i1imm $slc), 0) >; - def : Pat< + def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$glc, imm:$slc)), @@ -869,7 +900,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, (as_i1imm $glc), (as_i1imm $slc), 0) >; - def : Pat< + def : GCNPat< (vt (name v4i32:$rsrc, 0, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$glc, imm:$slc)), @@ -877,7 +908,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, (as_i1imm $glc), (as_i1imm $slc), 0) >; - def : Pat< + def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$glc, imm:$slc)), @@ -897,7 +928,7 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">; multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { - def : Pat< + def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$glc, imm:$slc), @@ -905,7 +936,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (as_i1imm $glc), (as_i1imm $slc), 0) >; - def : Pat< + def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$glc, imm:$slc), @@ -914,7 +945,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (as_i1imm $slc), 0) >; - def : Pat< + def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$glc, imm:$slc), @@ -923,7 +954,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (as_i1imm $slc), 0) >; - def : Pat< + def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$glc, imm:$slc), @@ -935,107 +966,107 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, >; } -defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">; -defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, 
"BUFFER_STORE_FORMAT_XY">; -defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">; -defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">; -defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">; -defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">; +defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">; //===----------------------------------------------------------------------===// // buffer_atomic patterns //===----------------------------------------------------------------------===// multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> { - def : Pat< + def : GCNPat< (name i32:$vdata_in, v4i32:$rsrc, 0, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$slc), - (!cast<MUBUF_Pseudo>(opcode # _RTN_OFFSET) $vdata_in, $rsrc, $soffset, + (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)) >; - def : Pat< + def : GCNPat< (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$slc), - (!cast<MUBUF_Pseudo>(opcode # _RTN_IDXEN) $vdata_in, $vindex, $rsrc, $soffset, + (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)) >; - def : Pat< + def : GCNPat< (name i32:$vdata_in, v4i32:$rsrc, 0, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$slc), - (!cast<MUBUF_Pseudo>(opcode # _RTN_OFFEN) $vdata_in, $voffset, $rsrc, $soffset, + (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)) >; - def : Pat< + def : GCNPat< (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$slc), - (!cast<MUBUF_Pseudo>(opcode # _RTN_BOTHEN) + (!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN) $vdata_in, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)) >; } -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_swap, "BUFFER_ATOMIC_SWAP">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_add, "BUFFER_ATOMIC_ADD">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_sub, "BUFFER_ATOMIC_SUB">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smin, "BUFFER_ATOMIC_SMIN">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umin, "BUFFER_ATOMIC_UMIN">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smax, "BUFFER_ATOMIC_SMAX">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umax, "BUFFER_ATOMIC_UMAX">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_and, "BUFFER_ATOMIC_AND">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_or, "BUFFER_ATOMIC_OR">; -defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_xor, "BUFFER_ATOMIC_XOR">; - -def : Pat< - (int_amdgcn_buffer_atomic_cmpswap +defm : BufferAtomicPatterns<SIbuffer_atomic_swap, "BUFFER_ATOMIC_SWAP">; +defm : BufferAtomicPatterns<SIbuffer_atomic_add, "BUFFER_ATOMIC_ADD">; +defm : 
BufferAtomicPatterns<SIbuffer_atomic_sub, "BUFFER_ATOMIC_SUB">; +defm : BufferAtomicPatterns<SIbuffer_atomic_smin, "BUFFER_ATOMIC_SMIN">; +defm : BufferAtomicPatterns<SIbuffer_atomic_umin, "BUFFER_ATOMIC_UMIN">; +defm : BufferAtomicPatterns<SIbuffer_atomic_smax, "BUFFER_ATOMIC_SMAX">; +defm : BufferAtomicPatterns<SIbuffer_atomic_umax, "BUFFER_ATOMIC_UMAX">; +defm : BufferAtomicPatterns<SIbuffer_atomic_and, "BUFFER_ATOMIC_AND">; +defm : BufferAtomicPatterns<SIbuffer_atomic_or, "BUFFER_ATOMIC_OR">; +defm : BufferAtomicPatterns<SIbuffer_atomic_xor, "BUFFER_ATOMIC_XOR">; + +def : GCNPat< + (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, 0, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$slc), (EXTRACT_SUBREG - (BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET + (BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), sub0) >; -def : Pat< - (int_amdgcn_buffer_atomic_cmpswap +def : GCNPat< + (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), imm:$slc), (EXTRACT_SUBREG - (BUFFER_ATOMIC_CMPSWAP_RTN_IDXEN + (BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), sub0) >; -def : Pat< - (int_amdgcn_buffer_atomic_cmpswap +def : GCNPat< + (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, 0, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$slc), (EXTRACT_SUBREG - (BUFFER_ATOMIC_CMPSWAP_RTN_OFFEN + (BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), sub0) >; -def : Pat< - (int_amdgcn_buffer_atomic_cmpswap +def : GCNPat< + (SIbuffer_atomic_cmpswap i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), imm:$slc), (EXTRACT_SUBREG - (BUFFER_ATOMIC_CMPSWAP_RTN_BOTHEN + (BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), @@ -1044,7 +1075,7 @@ def : Pat< class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt, - PatFrag constant_ld> : Pat < + PatFrag constant_ld> : GCNPat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) @@ -1052,19 +1083,19 @@ class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt, multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag atomic_ld> { - def : Pat < + def : GCNPat < (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0) + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0) >; - def : Pat < + def : GCNPat < (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), - (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0) + (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0) >; } -let Predicates = [isSICI] in { +let SubtargetPredicate = isSICI in { def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>; def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>; def : MUBUFLoad_PatternADDR64 
<BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>; @@ -1072,52 +1103,123 @@ def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_con defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>; defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>; -} // End Predicates = [isSICI] +} // End SubtargetPredicate = isSICI multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag ld> { - def : Pat < + def : GCNPat < (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe) >; } -let Predicates = [Has16BitInsts] in { +let OtherPredicates = [Has16BitInsts] in { defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_constant>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, mubuf_sextloadi8>; defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>; -} // End Predicates = [Has16BitInsts] +defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_OFFSET, i16, mubuf_load>; + +} // End OtherPredicates = [Has16BitInsts] multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen, MUBUF_Pseudo InstrOffset, ValueType vt, PatFrag ld> { - def : Pat < + def : GCNPat < (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset))), (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) >; - def : Pat < + def : GCNPat < (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0) >; } +// XXX - Is it possible to have a complex pattern in a PatFrag? 
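The *_D16 and *_D16_HI load pseudos added earlier, and the MUBUFScratchLoadPat_Hi16/_Lo16 multiclasses defined just below, depend on the load writing only one 16-bit half of the destination VGPR while the tied $vdata_in operand preserves the other half. A rough behavioral model of that semantics, for orientation only and not the ISA definition:

#include <cstdint>

// A D16_HI load replaces the high 16 bits of the tied destination and keeps
// the low half; a plain D16 (low) load does the opposite.
uint32_t loadShortD16Hi(uint32_t vdata_in, uint16_t loaded) {
  return (vdata_in & 0x0000FFFFu) | (uint32_t(loaded) << 16);
}

uint32_t loadShortD16Lo(uint32_t vdata_in, uint16_t loaded) {
  return (vdata_in & 0xFFFF0000u) | uint32_t(loaded);
}

This is why the build_vector patterns that follow feed the half that is already in a register as the final tied operand ($lo or $hi) of the selected instruction.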
+multiclass MUBUFScratchLoadPat_Hi16 <MUBUF_Pseudo InstrOffen, + MUBUF_Pseudo InstrOffset, + ValueType vt, PatFrag ld> { + def : GCNPat < + (build_vector vt:$lo, (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset)))), + (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo)) + >; + + def : GCNPat < + (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset)))))), + (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $lo)) + >; + + + def : GCNPat < + (build_vector vt:$lo, (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))), + (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo)) + >; + + def : GCNPat < + (build_vector f16:$lo, (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)))))), + (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $lo)) + >; +} + +multiclass MUBUFScratchLoadPat_Lo16 <MUBUF_Pseudo InstrOffen, + MUBUF_Pseudo InstrOffset, + ValueType vt, PatFrag ld> { + def : GCNPat < + (build_vector (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))), + (vt (Hi16Elt vt:$hi))), + (v2i16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi)) + >; + + def : GCNPat < + (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, + i32:$soffset, u16imm:$offset))))), + (f16 (Hi16Elt f16:$hi))), + (v2f16 (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $hi)) + >; + + def : GCNPat < + (build_vector (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), + (vt (Hi16Elt vt:$hi))), + (v2i16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi)) + >; + + def : GCNPat < + (build_vector (f16 (bitconvert (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))))), + (f16 (Hi16Elt f16:$hi))), + (v2f16 (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $hi)) + >; +} + defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, extloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, az_extloadi8_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET, i32, sextloadi16_private>; -defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, az_extloadi16_private>; +defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i16, load_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>; defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>; +let OtherPredicates = [HasD16LoadStore] in { +defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>; +defm : 
MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>; +defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>; + +defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, i16, load_private>; +defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, i16, az_extloadi8_private>; +defm : MUBUFScratchLoadPat_Lo16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, i16, sextloadi8_private>; +} + // BUFFER_LOAD_DWORD*, addr64=0 multiclass MUBUF_Load_Dword <ValueType vt, MUBUF_Pseudo offset, @@ -1125,7 +1227,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF_Pseudo idxen, MUBUF_Pseudo bothen> { - def : Pat < + def : GCNPat < (vt (int_SI_buffer_load_dword v4i32:$rsrc, (i32 imm), i32:$soffset, imm:$offset, 0, 0, imm:$glc, imm:$slc, imm:$tfe)), @@ -1133,7 +1235,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, (as_i1imm $slc), (as_i1imm $tfe)) >; - def : Pat < + def : GCNPat < (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm:$offset, 1, 0, imm:$glc, imm:$slc, imm:$tfe)), @@ -1141,7 +1243,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, (as_i1imm $tfe)) >; - def : Pat < + def : GCNPat < (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset, imm:$offset, 0, 1, imm:$glc, imm:$slc, imm:$tfe)), @@ -1149,7 +1251,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, (as_i1imm $slc), (as_i1imm $tfe)) >; - def : Pat < + def : GCNPat < (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset, imm:$offset, 1, 1, imm:$glc, imm:$slc, imm:$tfe)), @@ -1168,27 +1270,27 @@ defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_ multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag atomic_st> { // Store follows atomic op convention so address is forst - def : Pat < + def : GCNPat < (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vt:$val), - (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0) + (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0) >; - def : Pat < + def : GCNPat < (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), - (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0) + (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0) >; } -let Predicates = [isSICI] in { -defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, global_store_atomic>; -defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, global_store_atomic>; -} // End Predicates = [isSICI] +let SubtargetPredicate = isSICI in { +defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, store_atomic_global>; +defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, store_atomic_global>; +} // End Predicates = isSICI multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, PatFrag st> { - def : Pat < + def : GCNPat < (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)), (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe) @@ -1196,18 +1298,18 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt, } defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, 
truncstorei8_global>; -defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, global_store>; +defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, store_global>; multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen, MUBUF_Pseudo InstrOffset, ValueType vt, PatFrag st> { - def : Pat < + def : GCNPat < (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset)), (InstrOffen $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) >; - def : Pat < + def : GCNPat < (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)), (InstrOffset $value, $srsrc, $soffset, $offset, 0, 0, 0) @@ -1222,6 +1324,16 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private>; defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>; + +let OtherPredicates = [HasD16LoadStore] in { + // Hiding the extract high pattern in the PatFrag seems to not + // automatically increase the complexity. +let AddedComplexity = 1 in { +defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_D16_HI_OFFEN, BUFFER_STORE_SHORT_D16_HI_OFFSET, i32, store_hi16_private>; +defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_D16_HI_OFFEN, BUFFER_STORE_BYTE_D16_HI_OFFSET, i32, truncstorei8_hi16_private>; +} +} + //===----------------------------------------------------------------------===// // MTBUF Patterns //===----------------------------------------------------------------------===// @@ -1232,28 +1344,28 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OF multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { - def : Pat< + def : GCNPat< (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) >; - def : Pat< + def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) >; - def : Pat< + def : GCNPat< (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) >; - def : Pat< + def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), (!cast<MTBUF_Pseudo>(opcode # _BOTHEN) @@ -1272,7 +1384,7 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW"> multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, string opcode> { - def : Pat< + def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, @@ -1281,7 +1393,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (as_i1imm $slc), 0) >; - def : Pat< + def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), (!cast<MTBUF_Pseudo>(opcode # 
_IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, @@ -1290,7 +1402,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (as_i1imm $slc), 0) >; - def : Pat< + def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, @@ -1299,7 +1411,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, (as_i1imm $slc), 0) >; - def : Pat< + def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact) @@ -1319,8 +1431,6 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY" defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">; defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">; -} // End let Predicates = [isGCN] - //===----------------------------------------------------------------------===// // Target instructions, move to the appropriate target TD file //===----------------------------------------------------------------------===// @@ -1361,11 +1471,11 @@ multiclass MUBUF_Real_AllAddr_si<bits<7> op> { } multiclass MUBUF_Real_Atomic_si<bits<7> op> : MUBUF_Real_AllAddr_si<op> { - def _RTN_OFFSET_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFSET")>; - def _RTN_ADDR64_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_ADDR64")>; - def _RTN_OFFEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFEN")>; - def _RTN_IDXEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_IDXEN")>; - def _RTN_BOTHEN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_BOTHEN")>; + def _OFFSET_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>; + def _ADDR64_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_ADDR64_RTN")>; + def _OFFEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>; + def _IDXEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>; + def _BOTHEN_RTN_si : MUBUF_Real_si <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>; } defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_si <0x00>; @@ -1520,10 +1630,10 @@ multiclass MUBUF_Real_AllAddr_vi<bits<7> op> { multiclass MUBUF_Real_Atomic_vi<bits<7> op> : MUBUF_Real_AllAddr_vi<op> { - def _RTN_OFFSET_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFSET")>; - def _RTN_OFFEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_OFFEN")>; - def _RTN_IDXEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_IDXEN")>; - def _RTN_BOTHEN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_RTN_BOTHEN")>; + def _OFFSET_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>; + def _OFFEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_OFFEN_RTN")>; + def _IDXEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_IDXEN_RTN")>; + def _BOTHEN_RTN_vi : MUBUF_Real_vi <op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>; } defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_vi <0x00>; @@ -1543,12 +1653,21 @@ defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_vi <0x15>; defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_vi <0x16>; defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_vi <0x17>; defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_vi <0x18>; +defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x19>; defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_vi <0x1a>; +defm 
BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_vi <0x1b>; defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_vi <0x1c>; defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_vi <0x1d>; defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_vi <0x1e>; defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_vi <0x1f>; +defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Real_AllAddr_vi <0x20>; +defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x21>; +defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Real_AllAddr_vi <0x22>; +defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_vi <0x23>; +defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_vi <0x24>; +defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_vi <0x25>; + defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_vi <0x40>; defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_vi <0x41>; defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_vi <0x42>; diff --git a/lib/Target/AMDGPU/CIInstructions.td b/lib/Target/AMDGPU/CIInstructions.td deleted file mode 100644 index 26a483a8abf6..000000000000 --- a/lib/Target/AMDGPU/CIInstructions.td +++ /dev/null @@ -1,15 +0,0 @@ -//===-- CIInstructions.td - CI Instruction Defintions ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// Instruction definitions for CI and newer. -//===----------------------------------------------------------------------===// -// Remaining instructions: -// S_CBRANCH_CDBGUSER -// S_CBRANCH_CDBGSYS -// S_CBRANCH_CDBGSYS_OR_USER -// S_CBRANCH_CDBGSYS_AND_USER
\ No newline at end of file diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 971208c5db84..3a8503030414 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -12,57 +12,52 @@ tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher) tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM AMDGPUGenMCPseudoLowering.inc -gen-pseudo-lowering) -if(LLVM_BUILD_GLOBAL_ISEL) - tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank) -endif() +tablegen(LLVM AMDGPUGenRegisterBank.inc -gen-register-bank) add_public_tablegen_target(AMDGPUCommonTableGen) -# List of all GlobalISel files. -set(GLOBAL_ISEL_FILES - AMDGPUCallLowering.cpp - AMDGPUInstructionSelector.cpp - AMDGPULegalizerInfo.cpp - AMDGPURegisterBankInfo.cpp - ) - -# Add GlobalISel files to the dependencies if the user wants to build it. -if(LLVM_BUILD_GLOBAL_ISEL) - set(GLOBAL_ISEL_BUILD_FILES ${GLOBAL_ISEL_FILES}) -else() - set(GLOBAL_ISEL_BUILD_FILES"") - set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES}) -endif() - - add_llvm_target(AMDGPUCodeGen - AMDILCFGStructurizer.cpp AMDGPUAliasAnalysis.cpp AMDGPUAlwaysInlinePass.cpp AMDGPUAnnotateKernelFeatures.cpp AMDGPUAnnotateUniformValues.cpp + AMDGPUArgumentUsageInfo.cpp AMDGPUAsmPrinter.cpp + AMDGPUCallLowering.cpp AMDGPUCodeGenPrepare.cpp AMDGPUFrameLowering.cpp - AMDGPUTargetObjectFile.cpp + AMDGPUInstrInfo.cpp + AMDGPUInstructionSelector.cpp AMDGPUIntrinsicInfo.cpp AMDGPUISelDAGToDAG.cpp + AMDGPUISelLowering.cpp + AMDGPULegalizerInfo.cpp + AMDGPULibCalls.cpp + AMDGPULibFunc.cpp AMDGPULowerIntrinsics.cpp - AMDGPUMacroFusion.cpp - AMDGPUMCInstLower.cpp AMDGPUMachineCFGStructurizer.cpp AMDGPUMachineFunction.cpp - AMDGPUUnifyMetadata.cpp + AMDGPUMachineModuleInfo.cpp + AMDGPUMacroFusion.cpp + AMDGPUMCInstLower.cpp + AMDGPUOpenCLEnqueuedBlockLowering.cpp AMDGPUOpenCLImageTypeLoweringPass.cpp - AMDGPUSubtarget.cpp - AMDGPUTargetMachine.cpp - AMDGPUTargetTransformInfo.cpp - AMDGPUISelLowering.cpp - AMDGPUInstrInfo.cpp AMDGPUPromoteAlloca.cpp AMDGPURegAsmNames.inc.cpp + AMDGPURegisterBankInfo.cpp AMDGPURegisterInfo.cpp + AMDGPURewriteOutArguments.cpp + AMDGPUSubtarget.cpp + AMDGPUTargetMachine.cpp + AMDGPUTargetObjectFile.cpp + AMDGPUTargetTransformInfo.cpp AMDGPUUnifyDivergentExitNodes.cpp + AMDGPUUnifyMetadata.cpp + AMDGPUInline.cpp + AMDILCFGStructurizer.cpp GCNHazardRecognizer.cpp + GCNIterativeScheduler.cpp + GCNMinRegStrategy.cpp + GCNRegPressure.cpp GCNSchedStrategy.cpp R600ClauseMergePass.cpp R600ControlFlowFinalizer.cpp @@ -78,14 +73,14 @@ add_llvm_target(AMDGPUCodeGen R600RegisterInfo.cpp SIAnnotateControlFlow.cpp SIDebuggerInsertNops.cpp - SIFixControlFlowLiveIntervals.cpp SIFixSGPRCopies.cpp SIFixVGPRCopies.cpp + SIFixWWMLiveness.cpp SIFoldOperands.cpp SIFrameLowering.cpp SIInsertSkips.cpp - SIInsertWaits.cpp SIInsertWaitcnts.cpp + SIInsertWaits.cpp SIInstrInfo.cpp SIISelLowering.cpp SILoadStoreOptimizer.cpp @@ -93,15 +88,14 @@ add_llvm_target(AMDGPUCodeGen SILowerI1Copies.cpp SIMachineFunctionInfo.cpp SIMachineScheduler.cpp + SIMemoryLegalizer.cpp SIOptimizeExecMasking.cpp + SIOptimizeExecMaskingPreRA.cpp SIPeepholeSDWA.cpp SIRegisterInfo.cpp SIShrinkInstructions.cpp SIWholeQuadMode.cpp - GCNIterativeScheduler.cpp - GCNMinRegStrategy.cpp - GCNRegPressure.cpp - ${GLOBAL_ISEL_BUILD_FILES} + GCNILPSched.cpp ) add_subdirectory(AsmParser) diff --git a/lib/Target/AMDGPU/CaymanInstructions.td 
b/lib/Target/AMDGPU/CaymanInstructions.td index 6b8e85a73c73..ae40c6387982 100644 --- a/lib/Target/AMDGPU/CaymanInstructions.td +++ b/lib/Target/AMDGPU/CaymanInstructions.td @@ -18,7 +18,7 @@ def isCayman : Predicate<"Subtarget->hasCaymanISA()">; // Cayman Instructions //===----------------------------------------------------------------------===// -let Predicates = [isCayman] in { +let SubtargetPredicate = isCayman in { def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24", [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU @@ -57,26 +57,27 @@ defm DIV_cm : DIV_Common<RECIP_IEEE_cm>; // RECIP_UINT emulation for Cayman // The multiplication scales from [0,1] to the unsigned integer range -def : Pat < +def : R600Pat < (AMDGPUurecip i32:$src0), (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)), (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1))) >; - def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> { +def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> { let ADDR = 0; let POP_COUNT = 0; let COUNT = 0; } -def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>; + +def : R600Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>; class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> : CF_MEM_RAT_CACHELESS <0x14, 0, mask, (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr), "STORE_DWORD $rw_gpr, $index_gpr", - [(global_store vt:$rw_gpr, i32:$index_gpr)]> { + [(store_global vt:$rw_gpr, i32:$index_gpr)]> { let eop = 0; // This bit is not used on Cayman. } @@ -143,8 +144,8 @@ def VTX_READ_32_cm // to be caused by ALU instructions in the next instruction group that wrote // to the $src_gpr registers of the VTX_READ. // e.g. - // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24 - // %T2_X<def> = MOV %ZERO + // %t3_x = VTX_READ_PARAM_32_eg killed %t2_x, 24 + // %t2_x = MOV %zero //Adding this constraint prevents this from happening. 
let Constraints = "$src_gpr.ptr = $dst_gpr"; } @@ -179,44 +180,43 @@ def VTX_READ_128_cm //===----------------------------------------------------------------------===// // VTX Read from parameter memory space //===----------------------------------------------------------------------===// -def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)), +def : R600Pat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)), (VTX_READ_8_cm MEMxi:$src_gpr, 3)>; -def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)), +def : R600Pat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)), (VTX_READ_16_cm MEMxi:$src_gpr, 3)>; -def : Pat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), +def : R600Pat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), (VTX_READ_32_cm MEMxi:$src_gpr, 3)>; -def : Pat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), +def : R600Pat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), (VTX_READ_64_cm MEMxi:$src_gpr, 3)>; -def : Pat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), +def : R600Pat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), (VTX_READ_128_cm MEMxi:$src_gpr, 3)>; //===----------------------------------------------------------------------===// // VTX Read from constant memory space //===----------------------------------------------------------------------===// -def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)), +def : R600Pat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)), (VTX_READ_8_cm MEMxi:$src_gpr, 2)>; -def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)), +def : R600Pat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)), (VTX_READ_16_cm MEMxi:$src_gpr, 2)>; -def : Pat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), +def : R600Pat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), (VTX_READ_32_cm MEMxi:$src_gpr, 2)>; -def : Pat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), +def : R600Pat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), (VTX_READ_64_cm MEMxi:$src_gpr, 2)>; -def : Pat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), +def : R600Pat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), (VTX_READ_128_cm MEMxi:$src_gpr, 2)>; //===----------------------------------------------------------------------===// // VTX Read from global memory space //===----------------------------------------------------------------------===// -def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)), +def : R600Pat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)), (VTX_READ_8_cm MEMxi:$src_gpr, 1)>; -def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)), +def : R600Pat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)), (VTX_READ_16_cm MEMxi:$src_gpr, 1)>; -def : Pat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), +def : R600Pat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), (VTX_READ_32_cm MEMxi:$src_gpr, 1)>; -def : Pat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), +def : R600Pat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), (VTX_READ_64_cm MEMxi:$src_gpr, 1)>; -def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), +def : R600Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), (VTX_READ_128_cm MEMxi:$src_gpr, 1)>; -} // End isCayman - +} // End let SubtargetPredicate = isCayman diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td index fc516c3b39c2..f898fd7948cc 100644 --- 
a/lib/Target/AMDGPU/DSInstructions.td +++ b/lib/Target/AMDGPU/DSInstructions.td @@ -17,7 +17,6 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt let DS = 1; let Size = 8; let UseNamedOperandTable = 1; - let Uses = [M0, EXEC]; // Most instruction load and store data, so set this as the default. let mayLoad = 1; @@ -47,6 +46,10 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt bits<1> has_gds = 1; bits<1> gdsValue = 0; // if has_gds == 0 set gds to this value + + bits<1> has_m0_read = 1; + + let Uses = !if(has_m0_read, [M0, EXEC], [EXEC]); } class DS_Real <DS_Pseudo ds> : @@ -81,23 +84,41 @@ class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32> : DS_Pseudo<opName, (outs), (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds), - "$addr, $data0$offset$gds">, - AtomicNoRet<opName, 0> { + "$addr, $data0$offset$gds"> { let has_data1 = 0; let has_vdst = 0; } +multiclass DS_1A1D_NORET_mc<string opName, RegisterClass rc = VGPR_32> { + def "" : DS_1A1D_NORET<opName, rc>, + AtomicNoRet<opName, 0>; + + let has_m0_read = 0 in { + def _gfx9 : DS_1A1D_NORET<opName, rc>, + AtomicNoRet<opName#"_gfx9", 0>; + } +} + class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32> : DS_Pseudo<opName, (outs), (ins VGPR_32:$addr, rc:$data0, rc:$data1, offset:$offset, gds:$gds), - "$addr, $data0, $data1"#"$offset"#"$gds">, - AtomicNoRet<opName, 0> { + "$addr, $data0, $data1"#"$offset"#"$gds"> { let has_vdst = 0; } +multiclass DS_1A2D_NORET_mc<string opName, RegisterClass rc = VGPR_32> { + def "" : DS_1A2D_NORET<opName, rc>, + AtomicNoRet<opName, 0>; + + let has_m0_read = 0 in { + def _gfx9 : DS_1A2D_NORET<opName, rc>, + AtomicNoRet<opName#"_gfx9", 0>; + } +} + class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32> : DS_Pseudo<opName, (outs), @@ -110,6 +131,14 @@ class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32> let AsmMatchConverter = "cvtDSOffset01"; } +multiclass DS_1A2D_Off8_NORET_mc <string opName, RegisterClass rc = VGPR_32> { + def "" : DS_1A2D_Off8_NORET<opName, rc>; + + let has_m0_read = 0 in { + def _gfx9 : DS_1A2D_Off8_NORET<opName, rc>; + } +} + class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32> : DS_Pseudo<opName, (outs rc:$vdst), @@ -120,6 +149,18 @@ class DS_1A1D_RET <string opName, RegisterClass rc = VGPR_32> let has_data1 = 0; } +multiclass DS_1A1D_RET_mc <string opName, RegisterClass rc = VGPR_32, + string NoRetOp = ""> { + def "" : DS_1A1D_RET<opName, rc>, + AtomicNoRet<NoRetOp, !if(!eq(NoRetOp, ""), 0, 1)>; + + let has_m0_read = 0 in { + def _gfx9 : DS_1A1D_RET<opName, rc>, + AtomicNoRet<!if(!eq(NoRetOp, ""), "", NoRetOp#"_gfx9"), + !if(!eq(NoRetOp, ""), 0, 1)>; + } +} + class DS_1A2D_RET<string opName, RegisterClass rc = VGPR_32, RegisterClass src = rc> @@ -131,6 +172,19 @@ class DS_1A2D_RET<string opName, let hasPostISelHook = 1; } +multiclass DS_1A2D_RET_mc<string opName, + RegisterClass rc = VGPR_32, + string NoRetOp = "", + RegisterClass src = rc> { + def "" : DS_1A2D_RET<opName, rc, src>, + AtomicNoRet<NoRetOp, !if(!eq(NoRetOp, ""), 0, 1)>; + + let has_m0_read = 0 in { + def _gfx9 : DS_1A2D_RET<opName, rc, src>, + AtomicNoRet<NoRetOp#"_gfx9", !if(!eq(NoRetOp, ""), 0, 1)>; + } +} + class DS_1A2D_Off8_RET<string opName, RegisterClass rc = VGPR_32, RegisterClass src = rc> @@ -145,16 +199,41 @@ class DS_1A2D_Off8_RET<string opName, let hasPostISelHook = 1; } -class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, Operand ofs = offset> +multiclass 
DS_1A2D_Off8_RET_mc<string opName, + RegisterClass rc = VGPR_32, + RegisterClass src = rc> { + def "" : DS_1A2D_Off8_RET<opName, rc, src>; + + let has_m0_read = 0 in { + def _gfx9 : DS_1A2D_Off8_RET<opName, rc, src>; + } +} + + +class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset> : DS_Pseudo<opName, (outs rc:$vdst), - (ins VGPR_32:$addr, ofs:$offset, gds:$gds), + !if(HasTiedOutput, + (ins VGPR_32:$addr, ofs:$offset, gds:$gds, rc:$vdst_in), + (ins VGPR_32:$addr, ofs:$offset, gds:$gds)), "$vdst, $addr$offset$gds"> { - + let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); + let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); let has_data0 = 0; let has_data1 = 0; } +multiclass DS_1A_RET_mc<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset> { + def "" : DS_1A_RET<opName, rc, HasTiedOutput, ofs>; + + let has_m0_read = 0 in { + def _gfx9 : DS_1A_RET<opName, rc, HasTiedOutput, ofs>; + } +} + +class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> : + DS_1A_RET<opName, rc, 1>; + class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32> : DS_Pseudo<opName, (outs rc:$vdst), @@ -167,6 +246,14 @@ class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32> let AsmMatchConverter = "cvtDSOffset01"; } +multiclass DS_1A_Off8_RET_mc <string opName, RegisterClass rc = VGPR_32> { + def "" : DS_1A_Off8_RET<opName, rc>; + + let has_m0_read = 0 in { + def _gfx9 : DS_1A_Off8_RET<opName, rc>; + } +} + class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName, (outs VGPR_32:$vdst), (ins VGPR_32:$addr, offset:$offset), @@ -205,6 +292,15 @@ class DS_1A <string opName> : DS_Pseudo<opName, let has_data1 = 0; } +multiclass DS_1A_mc <string opName> { + def "" : DS_1A<opName>; + + let has_m0_read = 0 in { + def _gfx9 : DS_1A<opName>; + } +} + + class DS_GWS <string opName, dag ins, string asmOps> : DS_Pseudo<opName, (outs), ins, asmOps> { @@ -263,142 +359,115 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag> let has_gds = 0; } -def DS_ADD_U32 : DS_1A1D_NORET<"ds_add_u32">; -def DS_SUB_U32 : DS_1A1D_NORET<"ds_sub_u32">; -def DS_RSUB_U32 : DS_1A1D_NORET<"ds_rsub_u32">; -def DS_INC_U32 : DS_1A1D_NORET<"ds_inc_u32">; -def DS_DEC_U32 : DS_1A1D_NORET<"ds_dec_u32">; -def DS_MIN_I32 : DS_1A1D_NORET<"ds_min_i32">; -def DS_MAX_I32 : DS_1A1D_NORET<"ds_max_i32">; -def DS_MIN_U32 : DS_1A1D_NORET<"ds_min_u32">; -def DS_MAX_U32 : DS_1A1D_NORET<"ds_max_u32">; -def DS_AND_B32 : DS_1A1D_NORET<"ds_and_b32">; -def DS_OR_B32 : DS_1A1D_NORET<"ds_or_b32">; -def DS_XOR_B32 : DS_1A1D_NORET<"ds_xor_b32">; -def DS_ADD_F32 : DS_1A1D_NORET<"ds_add_f32">; -def DS_MIN_F32 : DS_1A1D_NORET<"ds_min_f32">; -def DS_MAX_F32 : DS_1A1D_NORET<"ds_max_f32">; +defm DS_ADD_U32 : DS_1A1D_NORET_mc<"ds_add_u32">; +defm DS_SUB_U32 : DS_1A1D_NORET_mc<"ds_sub_u32">; +defm DS_RSUB_U32 : DS_1A1D_NORET_mc<"ds_rsub_u32">; +defm DS_INC_U32 : DS_1A1D_NORET_mc<"ds_inc_u32">; +defm DS_DEC_U32 : DS_1A1D_NORET_mc<"ds_dec_u32">; +defm DS_MIN_I32 : DS_1A1D_NORET_mc<"ds_min_i32">; +defm DS_MAX_I32 : DS_1A1D_NORET_mc<"ds_max_i32">; +defm DS_MIN_U32 : DS_1A1D_NORET_mc<"ds_min_u32">; +defm DS_MAX_U32 : DS_1A1D_NORET_mc<"ds_max_u32">; +defm DS_AND_B32 : DS_1A1D_NORET_mc<"ds_and_b32">; +defm DS_OR_B32 : DS_1A1D_NORET_mc<"ds_or_b32">; +defm DS_XOR_B32 : DS_1A1D_NORET_mc<"ds_xor_b32">; +defm DS_ADD_F32 : DS_1A1D_NORET_mc<"ds_add_f32">; +defm DS_MIN_F32 : DS_1A1D_NORET_mc<"ds_min_f32">; +defm DS_MAX_F32 : DS_1A1D_NORET_mc<"ds_max_f32">; let 
mayLoad = 0 in { -def DS_WRITE_B8 : DS_1A1D_NORET<"ds_write_b8">; -def DS_WRITE_B16 : DS_1A1D_NORET<"ds_write_b16">; -def DS_WRITE_B32 : DS_1A1D_NORET<"ds_write_b32">; -def DS_WRITE2_B32 : DS_1A2D_Off8_NORET<"ds_write2_b32">; -def DS_WRITE2ST64_B32 : DS_1A2D_Off8_NORET<"ds_write2st64_b32">; -} - -def DS_MSKOR_B32 : DS_1A2D_NORET<"ds_mskor_b32">; -def DS_CMPST_B32 : DS_1A2D_NORET<"ds_cmpst_b32">; -def DS_CMPST_F32 : DS_1A2D_NORET<"ds_cmpst_f32">; - -def DS_ADD_U64 : DS_1A1D_NORET<"ds_add_u64", VReg_64>; -def DS_SUB_U64 : DS_1A1D_NORET<"ds_sub_u64", VReg_64>; -def DS_RSUB_U64 : DS_1A1D_NORET<"ds_rsub_u64", VReg_64>; -def DS_INC_U64 : DS_1A1D_NORET<"ds_inc_u64", VReg_64>; -def DS_DEC_U64 : DS_1A1D_NORET<"ds_dec_u64", VReg_64>; -def DS_MIN_I64 : DS_1A1D_NORET<"ds_min_i64", VReg_64>; -def DS_MAX_I64 : DS_1A1D_NORET<"ds_max_i64", VReg_64>; -def DS_MIN_U64 : DS_1A1D_NORET<"ds_min_u64", VReg_64>; -def DS_MAX_U64 : DS_1A1D_NORET<"ds_max_u64", VReg_64>; -def DS_AND_B64 : DS_1A1D_NORET<"ds_and_b64", VReg_64>; -def DS_OR_B64 : DS_1A1D_NORET<"ds_or_b64", VReg_64>; -def DS_XOR_B64 : DS_1A1D_NORET<"ds_xor_b64", VReg_64>; -def DS_MSKOR_B64 : DS_1A2D_NORET<"ds_mskor_b64", VReg_64>; +defm DS_WRITE_B8 : DS_1A1D_NORET_mc<"ds_write_b8">; +defm DS_WRITE_B16 : DS_1A1D_NORET_mc<"ds_write_b16">; +defm DS_WRITE_B32 : DS_1A1D_NORET_mc<"ds_write_b32">; +defm DS_WRITE2_B32 : DS_1A2D_Off8_NORET_mc<"ds_write2_b32">; +defm DS_WRITE2ST64_B32: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b32">; + + +let has_m0_read = 0 in { + +let SubtargetPredicate = HasD16LoadStore in { +def DS_WRITE_B8_D16_HI : DS_1A1D_NORET<"ds_write_b8_d16_hi">; +def DS_WRITE_B16_D16_HI : DS_1A1D_NORET<"ds_write_b16_d16_hi">; +} + +let SubtargetPredicate = HasDSAddTid in { +def DS_WRITE_ADDTID_B32 : DS_1A1D_NORET<"ds_write_addtid_b32">; +} + +} // End has_m0_read = 0 +} // End mayLoad = 0 + +defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">; +defm DS_CMPST_B32 : DS_1A2D_NORET_mc<"ds_cmpst_b32">; +defm DS_CMPST_F32 : DS_1A2D_NORET_mc<"ds_cmpst_f32">; + +defm DS_ADD_U64 : DS_1A1D_NORET_mc<"ds_add_u64", VReg_64>; +defm DS_SUB_U64 : DS_1A1D_NORET_mc<"ds_sub_u64", VReg_64>; +defm DS_RSUB_U64 : DS_1A1D_NORET_mc<"ds_rsub_u64", VReg_64>; +defm DS_INC_U64 : DS_1A1D_NORET_mc<"ds_inc_u64", VReg_64>; +defm DS_DEC_U64 : DS_1A1D_NORET_mc<"ds_dec_u64", VReg_64>; +defm DS_MIN_I64 : DS_1A1D_NORET_mc<"ds_min_i64", VReg_64>; +defm DS_MAX_I64 : DS_1A1D_NORET_mc<"ds_max_i64", VReg_64>; +defm DS_MIN_U64 : DS_1A1D_NORET_mc<"ds_min_u64", VReg_64>; +defm DS_MAX_U64 : DS_1A1D_NORET_mc<"ds_max_u64", VReg_64>; +defm DS_AND_B64 : DS_1A1D_NORET_mc<"ds_and_b64", VReg_64>; +defm DS_OR_B64 : DS_1A1D_NORET_mc<"ds_or_b64", VReg_64>; +defm DS_XOR_B64 : DS_1A1D_NORET_mc<"ds_xor_b64", VReg_64>; +defm DS_MSKOR_B64 : DS_1A2D_NORET_mc<"ds_mskor_b64", VReg_64>; let mayLoad = 0 in { -def DS_WRITE_B64 : DS_1A1D_NORET<"ds_write_b64", VReg_64>; -def DS_WRITE2_B64 : DS_1A2D_Off8_NORET<"ds_write2_b64", VReg_64>; -def DS_WRITE2ST64_B64 : DS_1A2D_Off8_NORET<"ds_write2st64_b64", VReg_64>; -} -def DS_CMPST_B64 : DS_1A2D_NORET<"ds_cmpst_b64", VReg_64>; -def DS_CMPST_F64 : DS_1A2D_NORET<"ds_cmpst_f64", VReg_64>; -def DS_MIN_F64 : DS_1A1D_NORET<"ds_min_f64", VReg_64>; -def DS_MAX_F64 : DS_1A1D_NORET<"ds_max_f64", VReg_64>; - -def DS_ADD_RTN_U32 : DS_1A1D_RET<"ds_add_rtn_u32">, - AtomicNoRet<"ds_add_u32", 1>; -def DS_ADD_RTN_F32 : DS_1A1D_RET<"ds_add_rtn_f32">, - AtomicNoRet<"ds_add_f32", 1>; -def DS_SUB_RTN_U32 : DS_1A1D_RET<"ds_sub_rtn_u32">, - AtomicNoRet<"ds_sub_u32", 1>; -def DS_RSUB_RTN_U32 : 
DS_1A1D_RET<"ds_rsub_rtn_u32">, - AtomicNoRet<"ds_rsub_u32", 1>; -def DS_INC_RTN_U32 : DS_1A1D_RET<"ds_inc_rtn_u32">, - AtomicNoRet<"ds_inc_u32", 1>; -def DS_DEC_RTN_U32 : DS_1A1D_RET<"ds_dec_rtn_u32">, - AtomicNoRet<"ds_dec_u32", 1>; -def DS_MIN_RTN_I32 : DS_1A1D_RET<"ds_min_rtn_i32">, - AtomicNoRet<"ds_min_i32", 1>; -def DS_MAX_RTN_I32 : DS_1A1D_RET<"ds_max_rtn_i32">, - AtomicNoRet<"ds_max_i32", 1>; -def DS_MIN_RTN_U32 : DS_1A1D_RET<"ds_min_rtn_u32">, - AtomicNoRet<"ds_min_u32", 1>; -def DS_MAX_RTN_U32 : DS_1A1D_RET<"ds_max_rtn_u32">, - AtomicNoRet<"ds_max_u32", 1>; -def DS_AND_RTN_B32 : DS_1A1D_RET<"ds_and_rtn_b32">, - AtomicNoRet<"ds_and_b32", 1>; -def DS_OR_RTN_B32 : DS_1A1D_RET<"ds_or_rtn_b32">, - AtomicNoRet<"ds_or_b32", 1>; -def DS_XOR_RTN_B32 : DS_1A1D_RET<"ds_xor_rtn_b32">, - AtomicNoRet<"ds_xor_b32", 1>; -def DS_MSKOR_RTN_B32 : DS_1A2D_RET<"ds_mskor_rtn_b32">, - AtomicNoRet<"ds_mskor_b32", 1>; -def DS_CMPST_RTN_B32 : DS_1A2D_RET <"ds_cmpst_rtn_b32">, - AtomicNoRet<"ds_cmpst_b32", 1>; -def DS_CMPST_RTN_F32 : DS_1A2D_RET <"ds_cmpst_rtn_f32">, - AtomicNoRet<"ds_cmpst_f32", 1>; -def DS_MIN_RTN_F32 : DS_1A1D_RET <"ds_min_rtn_f32">, - AtomicNoRet<"ds_min_f32", 1>; -def DS_MAX_RTN_F32 : DS_1A1D_RET <"ds_max_rtn_f32">, - AtomicNoRet<"ds_max_f32", 1>; - -def DS_WRXCHG_RTN_B32 : DS_1A1D_RET<"ds_wrxchg_rtn_b32">, - AtomicNoRet<"", 1>; -def DS_WRXCHG2_RTN_B32 : DS_1A2D_Off8_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>, - AtomicNoRet<"", 1>; -def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_Off8_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>, - AtomicNoRet<"", 1>; - -def DS_ADD_RTN_U64 : DS_1A1D_RET<"ds_add_rtn_u64", VReg_64>, - AtomicNoRet<"ds_add_u64", 1>; -def DS_SUB_RTN_U64 : DS_1A1D_RET<"ds_sub_rtn_u64", VReg_64>, - AtomicNoRet<"ds_sub_u64", 1>; -def DS_RSUB_RTN_U64 : DS_1A1D_RET<"ds_rsub_rtn_u64", VReg_64>, - AtomicNoRet<"ds_rsub_u64", 1>; -def DS_INC_RTN_U64 : DS_1A1D_RET<"ds_inc_rtn_u64", VReg_64>, - AtomicNoRet<"ds_inc_u64", 1>; -def DS_DEC_RTN_U64 : DS_1A1D_RET<"ds_dec_rtn_u64", VReg_64>, - AtomicNoRet<"ds_dec_u64", 1>; -def DS_MIN_RTN_I64 : DS_1A1D_RET<"ds_min_rtn_i64", VReg_64>, - AtomicNoRet<"ds_min_i64", 1>; -def DS_MAX_RTN_I64 : DS_1A1D_RET<"ds_max_rtn_i64", VReg_64>, - AtomicNoRet<"ds_max_i64", 1>; -def DS_MIN_RTN_U64 : DS_1A1D_RET<"ds_min_rtn_u64", VReg_64>, - AtomicNoRet<"ds_min_u64", 1>; -def DS_MAX_RTN_U64 : DS_1A1D_RET<"ds_max_rtn_u64", VReg_64>, - AtomicNoRet<"ds_max_u64", 1>; -def DS_AND_RTN_B64 : DS_1A1D_RET<"ds_and_rtn_b64", VReg_64>, - AtomicNoRet<"ds_and_b64", 1>; -def DS_OR_RTN_B64 : DS_1A1D_RET<"ds_or_rtn_b64", VReg_64>, - AtomicNoRet<"ds_or_b64", 1>; -def DS_XOR_RTN_B64 : DS_1A1D_RET<"ds_xor_rtn_b64", VReg_64>, - AtomicNoRet<"ds_xor_b64", 1>; -def DS_MSKOR_RTN_B64 : DS_1A2D_RET<"ds_mskor_rtn_b64", VReg_64>, - AtomicNoRet<"ds_mskor_b64", 1>; -def DS_CMPST_RTN_B64 : DS_1A2D_RET<"ds_cmpst_rtn_b64", VReg_64>, - AtomicNoRet<"ds_cmpst_b64", 1>; -def DS_CMPST_RTN_F64 : DS_1A2D_RET<"ds_cmpst_rtn_f64", VReg_64>, - AtomicNoRet<"ds_cmpst_f64", 1>; -def DS_MIN_RTN_F64 : DS_1A1D_RET<"ds_min_rtn_f64", VReg_64>, - AtomicNoRet<"ds_min_f64", 1>; -def DS_MAX_RTN_F64 : DS_1A1D_RET<"ds_max_rtn_f64", VReg_64>, - AtomicNoRet<"ds_max_f64", 1>; - -def DS_WRXCHG_RTN_B64 : DS_1A1D_RET<"ds_wrxchg_rtn_b64", VReg_64>, - AtomicNoRet<"", 1>; -def DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>, - AtomicNoRet<"", 1>; -def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>, - AtomicNoRet<"", 1>; +defm DS_WRITE_B64 : 
DS_1A1D_NORET_mc<"ds_write_b64", VReg_64>; +defm DS_WRITE2_B64 : DS_1A2D_Off8_NORET_mc<"ds_write2_b64", VReg_64>; +defm DS_WRITE2ST64_B64: DS_1A2D_Off8_NORET_mc<"ds_write2st64_b64", VReg_64>; +} +defm DS_CMPST_B64 : DS_1A2D_NORET_mc<"ds_cmpst_b64", VReg_64>; +defm DS_CMPST_F64 : DS_1A2D_NORET_mc<"ds_cmpst_f64", VReg_64>; +defm DS_MIN_F64 : DS_1A1D_NORET_mc<"ds_min_f64", VReg_64>; +defm DS_MAX_F64 : DS_1A1D_NORET_mc<"ds_max_f64", VReg_64>; + +defm DS_ADD_RTN_U32 : DS_1A1D_RET_mc<"ds_add_rtn_u32", VGPR_32, "ds_add_u32">; +defm DS_ADD_RTN_F32 : DS_1A1D_RET_mc<"ds_add_rtn_f32", VGPR_32, "ds_add_f32">; +defm DS_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; +defm DS_RSUB_RTN_U32 : DS_1A1D_RET_mc<"ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">; +defm DS_INC_RTN_U32 : DS_1A1D_RET_mc<"ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">; +defm DS_DEC_RTN_U32 : DS_1A1D_RET_mc<"ds_dec_rtn_u32", VGPR_32, "ds_dec_u32">; +defm DS_MIN_RTN_I32 : DS_1A1D_RET_mc<"ds_min_rtn_i32", VGPR_32, "ds_min_i32">; +defm DS_MAX_RTN_I32 : DS_1A1D_RET_mc<"ds_max_rtn_i32", VGPR_32, "ds_max_i32">; +defm DS_MIN_RTN_U32 : DS_1A1D_RET_mc<"ds_min_rtn_u32", VGPR_32, "ds_min_u32">; +defm DS_MAX_RTN_U32 : DS_1A1D_RET_mc<"ds_max_rtn_u32", VGPR_32, "ds_max_u32">; +defm DS_AND_RTN_B32 : DS_1A1D_RET_mc<"ds_and_rtn_b32", VGPR_32, "ds_and_b32">; +defm DS_OR_RTN_B32 : DS_1A1D_RET_mc<"ds_or_rtn_b32", VGPR_32, "ds_or_b32">; +defm DS_XOR_RTN_B32 : DS_1A1D_RET_mc<"ds_xor_rtn_b32", VGPR_32, "ds_xor_b32">; +defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">; +defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">; +defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; +defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc <"ds_min_rtn_f32", VGPR_32, "ds_min_f32">; +defm DS_MAX_RTN_F32 : DS_1A1D_RET_mc<"ds_max_rtn_f32", VGPR_32, "ds_max_f32">; + +defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b32">; +defm DS_WRXCHG2_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>; +defm DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>; + +defm DS_ADD_RTN_U64 : DS_1A1D_RET_mc<"ds_add_rtn_u64", VReg_64, "ds_add_u64">; +defm DS_SUB_RTN_U64 : DS_1A1D_RET_mc<"ds_sub_rtn_u64", VReg_64, "ds_sub_u64">; +defm DS_RSUB_RTN_U64 : DS_1A1D_RET_mc<"ds_rsub_rtn_u64", VReg_64, "ds_rsub_u64">; +defm DS_INC_RTN_U64 : DS_1A1D_RET_mc<"ds_inc_rtn_u64", VReg_64, "ds_inc_u64">; +defm DS_DEC_RTN_U64 : DS_1A1D_RET_mc<"ds_dec_rtn_u64", VReg_64, "ds_dec_u64">; +defm DS_MIN_RTN_I64 : DS_1A1D_RET_mc<"ds_min_rtn_i64", VReg_64, "ds_min_i64">; +defm DS_MAX_RTN_I64 : DS_1A1D_RET_mc<"ds_max_rtn_i64", VReg_64, "ds_max_i64">; +defm DS_MIN_RTN_U64 : DS_1A1D_RET_mc<"ds_min_rtn_u64", VReg_64, "ds_min_u64">; +defm DS_MAX_RTN_U64 : DS_1A1D_RET_mc<"ds_max_rtn_u64", VReg_64, "ds_max_u64">; +defm DS_AND_RTN_B64 : DS_1A1D_RET_mc<"ds_and_rtn_b64", VReg_64, "ds_and_b64">; +defm DS_OR_RTN_B64 : DS_1A1D_RET_mc<"ds_or_rtn_b64", VReg_64, "ds_or_b64">; +defm DS_XOR_RTN_B64 : DS_1A1D_RET_mc<"ds_xor_rtn_b64", VReg_64, "ds_xor_b64">; +defm DS_MSKOR_RTN_B64 : DS_1A2D_RET_mc<"ds_mskor_rtn_b64", VReg_64, "ds_mskor_b64">; +defm DS_CMPST_RTN_B64 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b64", VReg_64, "ds_cmpst_b64">; +defm DS_CMPST_RTN_F64 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f64", VReg_64, "ds_cmpst_f64">; +defm DS_MIN_RTN_F64 : DS_1A1D_RET_mc<"ds_min_rtn_f64", VReg_64, "ds_min_f64">; +defm DS_MAX_RTN_F64 : DS_1A1D_RET_mc<"ds_max_rtn_f64", VReg_64, "ds_max_f64">; + +defm 
DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b64", VReg_64>; +defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>; +defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>; def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">; def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">; @@ -440,22 +509,37 @@ def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">; def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">; let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in { -def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, SwizzleImm>; +def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, SwizzleImm>; } let mayStore = 0 in { -def DS_READ_I8 : DS_1A_RET<"ds_read_i8">; -def DS_READ_U8 : DS_1A_RET<"ds_read_u8">; -def DS_READ_I16 : DS_1A_RET<"ds_read_i16">; -def DS_READ_U16 : DS_1A_RET<"ds_read_u16">; -def DS_READ_B32 : DS_1A_RET<"ds_read_b32">; -def DS_READ_B64 : DS_1A_RET<"ds_read_b64", VReg_64>; - -def DS_READ2_B32 : DS_1A_Off8_RET<"ds_read2_b32", VReg_64>; -def DS_READ2ST64_B32 : DS_1A_Off8_RET<"ds_read2st64_b32", VReg_64>; +defm DS_READ_I8 : DS_1A_RET_mc<"ds_read_i8">; +defm DS_READ_U8 : DS_1A_RET_mc<"ds_read_u8">; +defm DS_READ_I16 : DS_1A_RET_mc<"ds_read_i16">; +defm DS_READ_U16 : DS_1A_RET_mc<"ds_read_u16">; +defm DS_READ_B32 : DS_1A_RET_mc<"ds_read_b32">; +defm DS_READ_B64 : DS_1A_RET_mc<"ds_read_b64", VReg_64>; + +defm DS_READ2_B32 : DS_1A_Off8_RET_mc<"ds_read2_b32", VReg_64>; +defm DS_READ2ST64_B32: DS_1A_Off8_RET_mc<"ds_read2st64_b32", VReg_64>; + +defm DS_READ2_B64 : DS_1A_Off8_RET_mc<"ds_read2_b64", VReg_128>; +defm DS_READ2ST64_B64: DS_1A_Off8_RET_mc<"ds_read2st64_b64", VReg_128>; + +let has_m0_read = 0 in { +let SubtargetPredicate = HasD16LoadStore in { +def DS_READ_U8_D16 : DS_1A_RET_Tied<"ds_read_u8_d16">; +def DS_READ_U8_D16_HI : DS_1A_RET_Tied<"ds_read_u8_d16_hi">; +def DS_READ_I8_D16 : DS_1A_RET_Tied<"ds_read_i8_d16">; +def DS_READ_I8_D16_HI : DS_1A_RET_Tied<"ds_read_i8_d16_hi">; +def DS_READ_U16_D16 : DS_1A_RET_Tied<"ds_read_u16_d16">; +def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">; +} -def DS_READ2_B64 : DS_1A_Off8_RET<"ds_read2_b64", VReg_128>; -def DS_READ2ST64_B64 : DS_1A_Off8_RET<"ds_read2st64_b64", VReg_128>; +let SubtargetPredicate = HasDSAddTid in { +def DS_READ_ADDTID_B32 : DS_1A_RET<"ds_read_addtid_b32">; +} +} // End has_m0_read = 0 } def DS_CONSUME : DS_0A_RET<"ds_consume">; @@ -468,21 +552,19 @@ def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">; let SubtargetPredicate = isCIVI in { -def DS_WRAP_RTN_B32 : DS_1A2D_RET<"ds_wrap_rtn_b32">, AtomicNoRet<"", 1>; - -def DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET<"ds_condxchg32_rtn_b64", VReg_64>, - AtomicNoRet<"", 1>; +defm DS_WRAP_RTN_B32 : DS_1A2D_RET_mc<"ds_wrap_rtn_b32", VGPR_32>; +defm DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET_mc<"ds_condxchg32_rtn_b64", VReg_64>; def DS_GWS_SEMA_RELEASE_ALL : DS_GWS_0D<"ds_gws_sema_release_all">; let mayStore = 0 in { -def DS_READ_B96 : DS_1A_RET<"ds_read_b96", VReg_96>; -def DS_READ_B128: DS_1A_RET<"ds_read_b128", VReg_128>; +defm DS_READ_B96 : DS_1A_RET_mc<"ds_read_b96", VReg_96>; +defm DS_READ_B128: DS_1A_RET_mc<"ds_read_b128", VReg_128>; } // End mayStore = 0 let mayLoad = 0 in { -def DS_WRITE_B96 : DS_1A1D_NORET<"ds_write_b96", VReg_96>; -def DS_WRITE_B128 : DS_1A1D_NORET<"ds_write_b128", VReg_128>; +defm DS_WRITE_B96 : DS_1A1D_NORET_mc<"ds_write_b96", VReg_96>; +defm DS_WRITE_B128 : DS_1A1D_NORET_mc<"ds_write_b128", VReg_128>; } // End mayLoad = 0 def DS_NOP : DS_VOID<"ds_nop">; @@ 
-508,107 +590,201 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32", // DS Patterns //===----------------------------------------------------------------------===// -let Predicates = [isGCN] in { - -def : Pat < +def : GCNPat < (int_amdgcn_ds_swizzle i32:$src, imm:$offset16), (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0)) >; -class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat < +class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), (inst $ptr, (as_i16imm $offset), (i1 0)) >; -def : DSReadPat <DS_READ_I8, i32, si_sextload_local_i8>; -def : DSReadPat <DS_READ_U8, i32, si_az_extload_local_i8>; -def : DSReadPat <DS_READ_I8, i16, si_sextload_local_i8>; -def : DSReadPat <DS_READ_U8, i16, si_az_extload_local_i8>; -def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>; -def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>; -def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>; -def : DSReadPat <DS_READ_U16, i16, si_load_local>; -def : DSReadPat <DS_READ_B32, i32, si_load_local>; +// FIXME: Passing name of PatFrag in workaround. Why doesn't +// !cast<PatFrag>(frag.NAME#"_m0") work!? +multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> { + + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSReadPat<inst, vt, !cast<PatFrag>(frag#"_m0")>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + def : DSReadPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>; + } +} + + +multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> { + def : GCNPat < + (build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))), + (v2i16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo)) + >; + + def : GCNPat < + (build_vector f16:$lo, (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))))), + (v2f16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo)) + >; +} + +multiclass DSReadPat_Lo16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> { + def : GCNPat < + (build_vector (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))), (vt (Hi16Elt vt:$hi))), + (v2i16 (inst $ptr, (as_i16imm $offset), 0, $hi)) + >; + + def : GCNPat < + (build_vector (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))))), (f16 (Hi16Elt f16:$hi))), + (v2f16 (inst $ptr, (as_i16imm $offset), 0, $hi)) + >; +} + +defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">; +defm : DSReadPat_mc <DS_READ_U8, i32, "az_extloadi8_local">; +defm : DSReadPat_mc <DS_READ_I8, i16, "sextloadi8_local">; +defm : DSReadPat_mc <DS_READ_U8, i16, "az_extloadi8_local">; +defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">; +defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">; +defm : DSReadPat_mc <DS_READ_U16, i32, "az_extloadi16_local">; +defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">; +defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">; let AddedComplexity = 100 in { -def : DSReadPat <DS_READ_B64, v2i32, si_load_local_align8>; +defm : DSReadPat_mc <DS_READ_B64, v2i32, "load_align8_local">; } // End AddedComplexity = 100 -def : Pat < - (v2i32 (si_load_local (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, - i8:$offset1))), - (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0)) ->; +let OtherPredicates = [HasD16LoadStore] in { +let AddedComplexity = 100 in { +defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>; +defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>; +defm : DSReadPat_Hi16<DS_READ_I8_D16_HI, sextloadi8_local>; + +defm : 
DSReadPat_Lo16<DS_READ_U16_D16, load_local>; +defm : DSReadPat_Lo16<DS_READ_U8_D16, az_extloadi8_local>; +defm : DSReadPat_Lo16<DS_READ_I8_D16, sextloadi8_local>; + +} +} -class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat < +class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)), (inst $ptr, $value, (as_i16imm $offset), (i1 0)) >; -def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>; -def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>; -def : DSWritePat <DS_WRITE_B8, i16, si_truncstore_local_i8>; -def : DSWritePat <DS_WRITE_B16, i16, si_store_local>; -def : DSWritePat <DS_WRITE_B32, i32, si_store_local>; +multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> { + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>; + } -let AddedComplexity = 100 in { + let OtherPredicates = [NotLDSRequiresM0Init] in { + def : DSWritePat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>; + } +} -def : DSWritePat <DS_WRITE_B64, v2i32, si_store_local_align8>; -} // End AddedComplexity = 100 +defm : DSWritePat_mc <DS_WRITE_B8, i32, "truncstorei8_local">; +defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">; +defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">; +defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">; +defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">; + +let OtherPredicates = [HasD16LoadStore] in { +def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>; +def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_local_hi16>; +} -def : Pat < - (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, - i8:$offset1)), - (DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)), - (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1, - (i1 0)) + +class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, PatFrag frag> : GCNPat < + (v2i32 (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))), + (inst $ptr, $offset0, $offset1, (i1 0)) +>; + +class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat< + (frag v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)), + (inst $ptr, (i32 (EXTRACT_SUBREG $value, sub0)), + (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1, + (i1 0)) >; -class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat < +let OtherPredicates = [LDSRequiresM0Init] in { +def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>; +def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>; +} + +let OtherPredicates = [NotLDSRequiresM0Init] in { +def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, load_local>; +def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>; +} + + +let AddedComplexity = 100 in { + +defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">; +} // End AddedComplexity = 100 +class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value), (inst $ptr, $value, (as_i16imm $offset), (i1 0)) >; -class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat < +multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> { + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_m0")>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + def : DSAtomicRetPat<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, 
!cast<PatFrag>(frag)>; + } +} + + + +class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) >; +multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> { + let OtherPredicates = [LDSRequiresM0Init] in { + def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_m0")>; + } + + let OtherPredicates = [NotLDSRequiresM0Init] in { + def : DSAtomicCmpXChg<!cast<DS_Pseudo>(inst.NAME#"_gfx9"), vt, !cast<PatFrag>(frag)>; + } +} + + // 32-bit atomics. -def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>; -def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>; -def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>; -def : DSAtomicRetPat<DS_INC_RTN_U32, i32, si_atomic_inc_local>; -def : DSAtomicRetPat<DS_DEC_RTN_U32, i32, si_atomic_dec_local>; -def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>; -def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>; -def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>; -def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>; -def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>; -def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>; -def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>; -def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>; +defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap_local">; +defm : DSAtomicRetPat_mc<DS_ADD_RTN_U32, i32, "atomic_load_add_local">; +defm : DSAtomicRetPat_mc<DS_SUB_RTN_U32, i32, "atomic_load_sub_local">; +defm : DSAtomicRetPat_mc<DS_INC_RTN_U32, i32, "atomic_inc_local">; +defm : DSAtomicRetPat_mc<DS_DEC_RTN_U32, i32, "atomic_dec_local">; +defm : DSAtomicRetPat_mc<DS_AND_RTN_B32, i32, "atomic_load_and_local">; +defm : DSAtomicRetPat_mc<DS_OR_RTN_B32, i32, "atomic_load_or_local">; +defm : DSAtomicRetPat_mc<DS_XOR_RTN_B32, i32, "atomic_load_xor_local">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_I32, i32, "atomic_load_min_local">; +defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max_local">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin_local">; +defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax_local">; +defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap_local">; // 64-bit atomics. 
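The DSAtomicRetPat / DSAtomicCmpXChg patterns above map the generic atomic DAG nodes onto the _RTN forms of the DS instructions, i.e. the variants whose vdst receives the value that was in LDS before the operation. A minimal sketch of that return-the-old-value convention, using std::atomic purely for illustration (operand-order details of the real ds_cmpst encoding are ignored here):

#include <atomic>
#include <cstdint>

// atomic_load_add_local -> DS_ADD_RTN_U32: the result is the pre-op value.
static uint32_t dsAddRtnU32(std::atomic<uint32_t> &Lds, uint32_t V) {
  return Lds.fetch_add(V); // fetch_add returns the old contents
}

// atomic_cmp_swap_local -> DS_CMPST_RTN_B32 (as matched by DSAtomicCmpXChg):
// compare, conditionally store, and return the previous memory contents.
static uint32_t dsCmpstRtnB32(std::atomic<uint32_t> &Lds, uint32_t Cmp,
                              uint32_t Swap) {
  uint32_t Old = Cmp;
  Lds.compare_exchange_strong(Old, Swap); // Old ends up holding the old value
  return Old;
}

int main() {
  std::atomic<uint32_t> Lds{7};
  uint32_t Old1 = dsAddRtnU32(Lds, 3);        // Old1 == 7, "LDS" now 10
  uint32_t Old2 = dsCmpstRtnB32(Lds, 10, 42); // Old2 == 10, "LDS" now 42
  return (Old1 == 7 && Old2 == 10 && Lds.load() == 42) ? 0 : 1;
}

The non-returning forms (DS_ADD_U32 and friends, tagged with AtomicNoRet) perform the same memory update but discard this result.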
-def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>; -def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>; -def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>; -def : DSAtomicRetPat<DS_INC_RTN_U64, i64, si_atomic_inc_local>; -def : DSAtomicRetPat<DS_DEC_RTN_U64, i64, si_atomic_dec_local>; -def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>; -def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>; -def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>; -def : DSAtomicRetPat<DS_MIN_RTN_I64, i64, si_atomic_load_min_local>; -def : DSAtomicRetPat<DS_MAX_RTN_I64, i64, si_atomic_load_max_local>; -def : DSAtomicRetPat<DS_MIN_RTN_U64, i64, si_atomic_load_umin_local>; -def : DSAtomicRetPat<DS_MAX_RTN_U64, i64, si_atomic_load_umax_local>; - -def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>; - -} // let Predicates = [isGCN] +defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap_local">; +defm : DSAtomicRetPat_mc<DS_ADD_RTN_U64, i64, "atomic_load_add_local">; +defm : DSAtomicRetPat_mc<DS_SUB_RTN_U64, i64, "atomic_load_sub_local">; +defm : DSAtomicRetPat_mc<DS_INC_RTN_U64, i64, "atomic_inc_local">; +defm : DSAtomicRetPat_mc<DS_DEC_RTN_U64, i64, "atomic_dec_local">; +defm : DSAtomicRetPat_mc<DS_AND_RTN_B64, i64, "atomic_load_and_local">; +defm : DSAtomicRetPat_mc<DS_OR_RTN_B64, i64, "atomic_load_or_local">; +defm : DSAtomicRetPat_mc<DS_XOR_RTN_B64, i64, "atomic_load_xor_local">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_I64, i64, "atomic_load_min_local">; +defm : DSAtomicRetPat_mc<DS_MAX_RTN_I64, i64, "atomic_load_max_local">; +defm : DSAtomicRetPat_mc<DS_MIN_RTN_U64, i64, "atomic_load_umin_local">; +defm : DSAtomicRetPat_mc<DS_MAX_RTN_U64, i64, "atomic_load_umax_local">; + +defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B64, i64, "atomic_cmp_swap_local">; //===----------------------------------------------------------------------===// // Real instructions @@ -834,6 +1010,7 @@ def DS_GWS_SEMA_V_vi : DS_Real_vi<0x9a, DS_GWS_SEMA_V>; def DS_GWS_SEMA_BR_vi : DS_Real_vi<0x9b, DS_GWS_SEMA_BR>; def DS_GWS_SEMA_P_vi : DS_Real_vi<0x9c, DS_GWS_SEMA_P>; def DS_GWS_BARRIER_vi : DS_Real_vi<0x9d, DS_GWS_BARRIER>; +def DS_WRITE_ADDTID_B32_vi : DS_Real_vi<0x1d, DS_WRITE_ADDTID_B32>; def DS_WRITE_B8_vi : DS_Real_vi<0x1e, DS_WRITE_B8>; def DS_WRITE_B16_vi : DS_Real_vi<0x1f, DS_WRITE_B16>; def DS_ADD_RTN_U32_vi : DS_Real_vi<0x20, DS_ADD_RTN_U32>; @@ -865,6 +1042,7 @@ def DS_READ_I8_vi : DS_Real_vi<0x39, DS_READ_I8>; def DS_READ_U8_vi : DS_Real_vi<0x3a, DS_READ_U8>; def DS_READ_I16_vi : DS_Real_vi<0x3b, DS_READ_I16>; def DS_READ_U16_vi : DS_Real_vi<0x3c, DS_READ_U16>; +def DS_READ_ADDTID_B32_vi : DS_Real_vi<0xb6, DS_READ_ADDTID_B32>; def DS_CONSUME_vi : DS_Real_vi<0xbd, DS_CONSUME>; def DS_APPEND_vi : DS_Real_vi<0xbe, DS_APPEND>; def DS_ORDERED_COUNT_vi : DS_Real_vi<0xbf, DS_ORDERED_COUNT>; @@ -893,6 +1071,16 @@ def DS_CMPST_F64_vi : DS_Real_vi<0x51, DS_CMPST_F64>; def DS_MIN_F64_vi : DS_Real_vi<0x52, DS_MIN_F64>; def DS_MAX_F64_vi : DS_Real_vi<0x53, DS_MAX_F64>; +def DS_WRITE_B8_D16_HI_vi : DS_Real_vi<0x54, DS_WRITE_B8_D16_HI>; +def DS_WRITE_B16_D16_HI_vi : DS_Real_vi<0x55, DS_WRITE_B16_D16_HI>; + +def DS_READ_U8_D16_vi : DS_Real_vi<0x56, DS_READ_U8_D16>; +def DS_READ_U8_D16_HI_vi : DS_Real_vi<0x57, DS_READ_U8_D16_HI>; +def DS_READ_I8_D16_vi : DS_Real_vi<0x58, DS_READ_I8_D16>; +def DS_READ_I8_D16_HI_vi : DS_Real_vi<0x59, DS_READ_I8_D16_HI>; +def DS_READ_U16_D16_vi : DS_Real_vi<0x5a, DS_READ_U16_D16>; +def 
DS_READ_U16_D16_HI_vi : DS_Real_vi<0x5b, DS_READ_U16_D16_HI>; + def DS_ADD_RTN_U64_vi : DS_Real_vi<0x60, DS_ADD_RTN_U64>; def DS_SUB_RTN_U64_vi : DS_Real_vi<0x61, DS_SUB_RTN_U64>; def DS_RSUB_RTN_U64_vi : DS_Real_vi<0x62, DS_RSUB_RTN_U64>; diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 966c6fec20c6..4a3f2c975179 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA --------------===// +//===- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA ---------------===// // // The LLVM Compiler Infrastructure // @@ -17,29 +17,40 @@ // ToDo: What to do with instruction suffixes (v_mov_b32 vs v_mov_b32_e32)? -#include "AMDGPUDisassembler.h" +#include "Disassembler/AMDGPUDisassembler.h" #include "AMDGPU.h" #include "AMDGPURegisterInfo.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" #include "Utils/AMDGPUBaseInfo.h" - +#include "llvm-c/Disassembler.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/Endian.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <iterator> +#include <tuple> +#include <vector> using namespace llvm; #define DEBUG_TYPE "amdgpu-disassembler" -typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; - +using DecodeStatus = llvm::MCDisassembler::DecodeStatus; inline static MCDisassembler::DecodeStatus addOperand(MCInst &Inst, const MCOperand& Opnd) { @@ -95,13 +106,13 @@ DECODE_OPERAND_REG(VReg_128) DECODE_OPERAND_REG(SReg_32) DECODE_OPERAND_REG(SReg_32_XM0_XEXEC) +DECODE_OPERAND_REG(SReg_32_XEXEC_HI) DECODE_OPERAND_REG(SReg_64) DECODE_OPERAND_REG(SReg_64_XEXEC) DECODE_OPERAND_REG(SReg_128) DECODE_OPERAND_REG(SReg_256) DECODE_OPERAND_REG(SReg_512) - static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, unsigned Imm, uint64_t Addr, @@ -201,12 +212,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address); if (Res) break; + Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address); + if (Res) break; + if (Bytes.size() < 4) break; const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW; Res = tryDecodeInst(DecoderTableVI64, MI, QW, Address); if (Res) break; Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address); + if (Res) break; + + Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address); } while (false); if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || @@ -217,6 +234,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, AMDGPU::OpName::src2_modifiers); } + if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::MIMG)) { + Res = convertMIMGInst(MI); + } + if (Res && IsSDWA) Res = convertSDWAInst(MI); @@ -233,7 +254,7 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { int SDst = 
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst); if (SDst != -1) { // VOPC - insert VCC register as sdst - insertNamedMCOperand(MI, MCOperand::createReg(AMDGPU::VCC), + insertNamedMCOperand(MI, createRegOperand(AMDGPU::VCC), AMDGPU::OpName::sdst); } else { // VOP1/2 - insert omod if present in instruction @@ -243,6 +264,42 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const { return MCDisassembler::Success; } +DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const { + int VDataIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::vdata); + + int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::dmask); + unsigned DMask = MI.getOperand(DMaskIdx).getImm() & 0xf; + if (DMask == 0) + return MCDisassembler::Success; + + unsigned ChannelCount = countPopulation(DMask); + if (ChannelCount == 1) + return MCDisassembler::Success; + + int NewOpcode = AMDGPU::getMaskedMIMGOp(*MCII, MI.getOpcode(), ChannelCount); + assert(NewOpcode != -1 && "could not find matching mimg channel instruction"); + auto RCID = MCII->get(NewOpcode).OpInfo[VDataIdx].RegClass; + + // Widen the register to the correct number of enabled channels. + unsigned Vdata0 = MI.getOperand(VDataIdx).getReg(); + auto NewVdata = MRI.getMatchingSuperReg(Vdata0, AMDGPU::sub0, + &MRI.getRegClass(RCID)); + if (NewVdata == AMDGPU::NoRegister) { + // It's possible to encode this such that the low register + enabled + // components exceeds the register count. + return MCDisassembler::Success; + } + + MI.setOpcode(NewOpcode); + // vaddr will be always appear as a single VGPR. This will look different than + // how it is usually emitted because the number of register components is not + // in the instruction encoding. + MI.getOperand(VDataIdx) = MCOperand::createReg(NewVdata); + return MCDisassembler::Success; +} + const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const { return getContext().getRegisterInfo()-> getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]); @@ -260,7 +317,7 @@ MCOperand AMDGPUDisassembler::errOperand(unsigned V, inline MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const { - return MCOperand::createReg(RegId); + return MCOperand::createReg(AMDGPU::getMCReg(RegId, STI)); } inline @@ -365,6 +422,12 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0_XEXEC( return decodeOperand_SReg_32(Val); } +MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XEXEC_HI( + unsigned Val) const { + // SReg_32_XM0 is SReg_32 without EXEC_HI + return decodeOperand_SReg_32(Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const { return decodeSrcOp(OPW64, Val); } @@ -385,7 +448,6 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const { return createSRegOperand(AMDGPU::SReg_512RegClassID, Val); } - MCOperand AMDGPUDisassembler::decodeLiteralConstant() const { // For now all literal constants are supposed to be unsigned integer // ToDo: deal with signed/unsigned 64-bit integer constants @@ -403,6 +465,7 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant() const { MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) { using namespace AMDGPU::EncValues; + assert(Imm >= INLINE_INTEGER_C_MIN && Imm <= INLINE_INTEGER_C_MAX); return MCOperand::createImm((Imm <= INLINE_INTEGER_C_POSITIVE_MAX) ? 
(static_cast<int64_t>(Imm) - INLINE_INTEGER_C_MIN) : @@ -505,6 +568,7 @@ MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) { unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const { using namespace AMDGPU; + assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); switch (Width) { default: // fall @@ -519,6 +583,7 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const { unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const { using namespace AMDGPU; + assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); switch (Width) { default: // fall @@ -533,6 +598,7 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const { unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const { using namespace AMDGPU; + assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); switch (Width) { default: // fall @@ -545,8 +611,18 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const { } } +int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const { + using namespace AMDGPU::EncValues; + + unsigned TTmpMin = isGFX9() ? TTMP_GFX9_MIN : TTMP_VI_MIN; + unsigned TTmpMax = isGFX9() ? TTMP_GFX9_MAX : TTMP_VI_MAX; + + return (TTmpMin <= Val && Val <= TTmpMax)? Val - TTmpMin : -1; +} + MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) const { using namespace AMDGPU::EncValues; + assert(Val < 512); // enum9 if (VGPR_MIN <= Val && Val <= VGPR_MAX) { @@ -556,8 +632,10 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning. return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN); } - if (TTMP_MIN <= Val && Val <= TTMP_MAX) { - return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN); + + int TTmpIdx = getTTmpIdx(Val); + if (TTmpIdx >= 0) { + return createSRegOperand(getTtmpClassId(Width), TTmpIdx); } if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX) @@ -583,18 +661,19 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { using namespace AMDGPU; + switch (Val) { - case 102: return createRegOperand(getMCReg(FLAT_SCR_LO, STI)); - case 103: return createRegOperand(getMCReg(FLAT_SCR_HI, STI)); + case 102: return createRegOperand(FLAT_SCR_LO); + case 103: return createRegOperand(FLAT_SCR_HI); // ToDo: no support for xnack_mask_lo/_hi register case 104: case 105: break; case 106: return createRegOperand(VCC_LO); case 107: return createRegOperand(VCC_HI); - case 108: return createRegOperand(TBA_LO); - case 109: return createRegOperand(TBA_HI); - case 110: return createRegOperand(TMA_LO); - case 111: return createRegOperand(TMA_HI); + case 108: assert(!isGFX9()); return createRegOperand(TBA_LO); + case 109: assert(!isGFX9()); return createRegOperand(TBA_HI); + case 110: assert(!isGFX9()); return createRegOperand(TMA_LO); + case 111: assert(!isGFX9()); return createRegOperand(TMA_HI); case 124: return createRegOperand(M0); case 126: return createRegOperand(EXEC_LO); case 127: return createRegOperand(EXEC_HI); @@ -615,11 +694,12 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { using namespace AMDGPU; + switch (Val) { - case 102: return createRegOperand(getMCReg(FLAT_SCR, STI)); + case 102: return createRegOperand(FLAT_SCR); case 106: return 
createRegOperand(VCC); - case 108: return createRegOperand(TBA); - case 110: return createRegOperand(TMA); + case 108: assert(!isGFX9()); return createRegOperand(TBA); + case 110: assert(!isGFX9()); return createRegOperand(TMA); case 126: return createRegOperand(EXEC); default: break; } @@ -643,6 +723,11 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width, return createSRegOperand(getSgprClassId(Width), Val - SDWA9EncValues::SRC_SGPR_MIN); } + if (SDWA9EncValues::SRC_TTMP_MIN <= Val && + Val <= SDWA9EncValues::SRC_TTMP_MAX) { + return createSRegOperand(getTtmpClassId(Width), + Val - SDWA9EncValues::SRC_TTMP_MIN); + } return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) { @@ -659,7 +744,6 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const { return decodeSDWASrc(OPW32, Val); } - MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { using namespace AMDGPU::SDWA; @@ -667,7 +751,11 @@ MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { "SDWAVopcDst should be present only on GFX9"); if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) { Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK; - if (Val > AMDGPU::EncValues::SGPR_MAX) { + + int TTmpIdx = getTTmpIdx(Val); + if (TTmpIdx >= 0) { + return createSRegOperand(getTtmpClassId(OPW64), TTmpIdx); + } else if (Val > AMDGPU::EncValues::SGPR_MAX) { return decodeSpecialReg64(Val); } else { return createSRegOperand(getSgprClassId(OPW64), Val); @@ -677,6 +765,14 @@ MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { } } +bool AMDGPUDisassembler::isVI() const { + return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; +} + +bool AMDGPUDisassembler::isGFX9() const { + return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; +} + //===----------------------------------------------------------------------===// // AMDGPUSymbolizer //===----------------------------------------------------------------------===// @@ -686,8 +782,8 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst, raw_ostream &/*cStream*/, int64_t Value, uint64_t /*Address*/, bool IsBranch, uint64_t /*Offset*/, uint64_t /*InstSize*/) { - typedef std::tuple<uint64_t, StringRef, uint8_t> SymbolInfoTy; - typedef std::vector<SymbolInfoTy> SectionSymbolsTy; + using SymbolInfoTy = std::tuple<uint64_t, StringRef, uint8_t>; + using SectionSymbolsTy = std::vector<SymbolInfoTy>; if (!IsBranch) { return false; @@ -730,7 +826,7 @@ static MCSymbolizer *createAMDGPUSymbolizer(const Triple &/*TT*/, static MCDisassembler *createAMDGPUDisassembler(const Target &T, const MCSubtargetInfo &STI, MCContext &Ctx) { - return new AMDGPUDisassembler(STI, Ctx); + return new AMDGPUDisassembler(STI, Ctx, T.createMCInstrInfo()); } extern "C" void LLVMInitializeAMDGPUDisassembler() { diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 4c755be09999..ce396eb68c4c 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -1,4 +1,4 @@ -//===-- AMDGPUDisassembler.hpp - Disassembler for AMDGPU ISA ---*- C++ -*--===// +//===- AMDGPUDisassembler.hpp - Disassembler for AMDGPU ISA -----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,16 +17,18 @@ #define LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrInfo.h" #include 
"llvm/MC/MCDisassembler/MCDisassembler.h" #include "llvm/MC/MCDisassembler/MCRelocationInfo.h" #include "llvm/MC/MCDisassembler/MCSymbolizer.h" + #include <algorithm> #include <cstdint> #include <memory> namespace llvm { -class MCContext; class MCInst; class MCOperand; class MCSubtargetInfo; @@ -38,13 +40,16 @@ class Twine; class AMDGPUDisassembler : public MCDisassembler { private: + std::unique_ptr<MCInstrInfo const> const MCII; + const MCRegisterInfo &MRI; mutable ArrayRef<uint8_t> Bytes; mutable uint32_t Literal; mutable bool HasLiteral; public: - AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : - MCDisassembler(STI, Ctx) {} + AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, + MCInstrInfo const *MCII) : + MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()) {} ~AMDGPUDisassembler() override = default; @@ -60,12 +65,11 @@ public: MCOperand errOperand(unsigned V, const Twine& ErrMsg) const; - DecodeStatus tryDecodeInst(const uint8_t* Table, - MCInst &MI, - uint64_t Inst, - uint64_t Address) const; + DecodeStatus tryDecodeInst(const uint8_t* Table, MCInst &MI, uint64_t Inst, + uint64_t Address) const; DecodeStatus convertSDWAInst(MCInst &MI) const; + DecodeStatus convertMIMGInst(MCInst &MI) const; MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VS_32(unsigned Val) const; @@ -80,6 +84,7 @@ public: MCOperand decodeOperand_SReg_32(unsigned Val) const; MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const; + MCOperand decodeOperand_SReg_32_XEXEC_HI(unsigned Val) const; MCOperand decodeOperand_SReg_64(unsigned Val) const; MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const; MCOperand decodeOperand_SReg_128(unsigned Val) const; @@ -112,7 +117,12 @@ public: MCOperand decodeSDWASrc16(unsigned Val) const; MCOperand decodeSDWASrc32(unsigned Val) const; MCOperand decodeSDWAVopcDst(unsigned Val) const; -}; + + int getTTmpIdx(unsigned Val) const; + + bool isVI() const; + bool isGFX9() const; + }; //===----------------------------------------------------------------------===// // AMDGPUSymbolizer diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td index 5480110d8315..5e26f97b0c86 100644 --- a/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/lib/Target/AMDGPU/EvergreenInstructions.td @@ -15,20 +15,28 @@ def isEG : Predicate< "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && " - "Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && " + "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && " "!Subtarget->hasCaymanISA()" >; def isEGorCayman : Predicate< "Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||" - "Subtarget->getGeneration() ==AMDGPUSubtarget::NORTHERN_ISLANDS" + "Subtarget->getGeneration() == AMDGPUSubtarget::NORTHERN_ISLANDS" >; +class EGPat<dag pattern, dag result> : AMDGPUPat<pattern, result> { + let SubtargetPredicate = isEG; +} + +class EGOrCaymanPat<dag pattern, dag result> : AMDGPUPat<pattern, result> { + let SubtargetPredicate = isEGorCayman; +} + //===----------------------------------------------------------------------===// // Evergreen / Cayman store instructions //===----------------------------------------------------------------------===// -let Predicates = [isEGorCayman] in { +let SubtargetPredicate = isEGorCayman in { class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins, string name, list<dag> pattern> @@ -88,13 +96,13 @@ defm RAT_ATOMIC_XOR : RAT_ATOMIC<16, 48, 
"ATOMIC_XOR">; defm RAT_ATOMIC_INC_UINT : RAT_ATOMIC<18, 50, "ATOMIC_INC_UINT">; defm RAT_ATOMIC_DEC_UINT : RAT_ATOMIC<19, 51, "ATOMIC_DEC_UINT">; -} // End let Predicates = [isEGorCayman] +} // End SubtargetPredicate = isEGorCayman //===----------------------------------------------------------------------===// // Evergreen Only instructions //===----------------------------------------------------------------------===// -let Predicates = [isEG] in { +let SubtargetPredicate = isEG in { def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>; defm DIV_eg : DIV_Common<RECIP_IEEE_eg>; @@ -116,7 +124,8 @@ def SIN_eg : SIN_Common<0x8D>; def COS_eg : COS_Common<0x8E>; def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>; -def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; +def : EGPat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; +} // End SubtargetPredicate = isEG //===----------------------------------------------------------------------===// // Memory read/write instructions @@ -128,21 +137,21 @@ let usesCustomInserter = 1 in { def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1, (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), "STORE_RAW $rw_gpr, $index_gpr, $eop", - [(global_store i32:$rw_gpr, i32:$index_gpr)] + [(store_global i32:$rw_gpr, i32:$index_gpr)] >; // 64-bit store def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3, (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), "STORE_RAW $rw_gpr.XY, $index_gpr, $eop", - [(global_store v2i32:$rw_gpr, i32:$index_gpr)] + [(store_global v2i32:$rw_gpr, i32:$index_gpr)] >; //128-bit store def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf, (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop", - [(global_store v4i32:$rw_gpr, i32:$index_gpr)] + [(store_global v4i32:$rw_gpr, i32:$index_gpr)] >; def RAT_STORE_TYPED_eg: CF_MEM_RAT_STORE_TYPED<1>; @@ -203,8 +212,8 @@ def VTX_READ_32_eg // to be caused by ALU instructions in the next instruction group that wrote // to the $src_gpr registers of the VTX_READ. // e.g. - // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24 - // %T2_X<def> = MOV %ZERO + // %t3_x = VTX_READ_PARAM_32_eg killed %t2_x, 24 + // %t2_x = MOV %zero //Adding this constraint prevents this from happening. 
let Constraints = "$src_gpr.ptr = $dst_gpr"; } @@ -241,58 +250,56 @@ def VTX_READ_128_eg //===----------------------------------------------------------------------===// // VTX Read from parameter memory space //===----------------------------------------------------------------------===// -def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)), +def : EGPat<(i32:$dst_gpr (vtx_id3_az_extloadi8 ADDRVTX_READ:$src_gpr)), (VTX_READ_8_eg MEMxi:$src_gpr, 3)>; -def : Pat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)), +def : EGPat<(i32:$dst_gpr (vtx_id3_az_extloadi16 ADDRVTX_READ:$src_gpr)), (VTX_READ_16_eg MEMxi:$src_gpr, 3)>; -def : Pat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), +def : EGPat<(i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), (VTX_READ_32_eg MEMxi:$src_gpr, 3)>; -def : Pat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), +def : EGPat<(v2i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), (VTX_READ_64_eg MEMxi:$src_gpr, 3)>; -def : Pat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), +def : EGPat<(v4i32:$dst_gpr (vtx_id3_load ADDRVTX_READ:$src_gpr)), (VTX_READ_128_eg MEMxi:$src_gpr, 3)>; //===----------------------------------------------------------------------===// // VTX Read from constant memory space //===----------------------------------------------------------------------===// -def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)), +def : EGPat<(i32:$dst_gpr (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr)), (VTX_READ_8_eg MEMxi:$src_gpr, 2)>; -def : Pat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)), +def : EGPat<(i32:$dst_gpr (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr)), (VTX_READ_16_eg MEMxi:$src_gpr, 2)>; -def : Pat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), +def : EGPat<(i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), (VTX_READ_32_eg MEMxi:$src_gpr, 2)>; -def : Pat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), +def : EGPat<(v2i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), (VTX_READ_64_eg MEMxi:$src_gpr, 2)>; -def : Pat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), +def : EGPat<(v4i32:$dst_gpr (vtx_id2_load ADDRVTX_READ:$src_gpr)), (VTX_READ_128_eg MEMxi:$src_gpr, 2)>; //===----------------------------------------------------------------------===// // VTX Read from global memory space //===----------------------------------------------------------------------===// -def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)), +def : EGPat<(i32:$dst_gpr (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr)), (VTX_READ_8_eg MEMxi:$src_gpr, 1)>; -def : Pat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)), +def : EGPat<(i32:$dst_gpr (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr)), (VTX_READ_16_eg MEMxi:$src_gpr, 1)>; -def : Pat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), +def : EGPat<(i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), (VTX_READ_32_eg MEMxi:$src_gpr, 1)>; -def : Pat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), +def : EGPat<(v2i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), (VTX_READ_64_eg MEMxi:$src_gpr, 1)>; -def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), +def : EGPat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)), (VTX_READ_128_eg MEMxi:$src_gpr, 1)>; -} // End Predicates = [isEG] - //===----------------------------------------------------------------------===// // Evergreen / Cayman Instructions //===----------------------------------------------------------------------===// 
-let Predicates = [isEGorCayman] in { +let SubtargetPredicate = isEGorCayman in { multiclass AtomicPat<Instruction inst_ret, Instruction inst_noret, SDPatternOperator node_ret, SDPatternOperator node_noret> { // FIXME: Add _RTN version. We need per WI scratch location to store the old value // EXTRACT_SUBREG here is dummy, we know the node has no uses - def : Pat<(i32 (node_noret i32:$ptr, i32:$data)), + def : EGOrCaymanPat<(i32 (node_noret i32:$ptr, i32:$data)), (EXTRACT_SUBREG (inst_noret (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), $data, sub0), $ptr), sub1)>; } @@ -300,7 +307,7 @@ multiclass AtomicIncDecPat<Instruction inst_ret, Instruction inst_noret, SDPatternOperator node_ret, SDPatternOperator node_noret, int C> { // FIXME: Add _RTN version. We need per WI scratch location to store the old value // EXTRACT_SUBREG here is dummy, we know the node has no uses - def : Pat<(i32 (node_noret i32:$ptr, C)), + def : EGOrCaymanPat<(i32 (node_noret i32:$ptr, C)), (EXTRACT_SUBREG (inst_noret (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (MOV_IMM_I32 -1), sub0), $ptr), sub1)>; } @@ -308,7 +315,7 @@ multiclass AtomicIncDecPat<Instruction inst_ret, Instruction inst_noret, // CMPSWAP is pattern is special // EXTRACT_SUBREG here is dummy, we know the node has no uses // FIXME: Add _RTN version. We need per WI scratch location to store the old value -def : Pat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$data)), +def : EGOrCaymanPat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$data)), (EXTRACT_SUBREG (RAT_ATOMIC_CMPXCHG_INT_NORET (INSERT_SUBREG (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), $cmp, sub3), @@ -395,11 +402,11 @@ def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", VecALU >; -def : Pat<(i32 (sext_inreg i32:$src, i1)), +def : EGOrCaymanPat<(i32 (sext_inreg i32:$src, i1)), (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>; -def : Pat<(i32 (sext_inreg i32:$src, i8)), +def : EGOrCaymanPat<(i32 (sext_inreg i32:$src, i8)), (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>; -def : Pat<(i32 (sext_inreg i32:$src, i16)), +def : EGOrCaymanPat<(i32 (sext_inreg i32:$src, i16)), (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>; defm : BFIPatterns <BFI_INT_eg, MOV_IMM_I32, R600_Reg64>; @@ -442,7 +449,7 @@ def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", AMDGPUfp_to_f16, V def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>; def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>; def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>; -def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", cttz_zero_undef, VecALU>; +def FFBL_INT : R600_1OP_Helper <0xAC, "FFBL_INT", AMDGPUffbl_b32, VecALU>; let hasSideEffects = 1 in { def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; @@ -614,7 +621,7 @@ def LDS_MAX_INT : R600_LDS_1A1D_NORET <0x6, "LDS_MAX_INT", [] >; def LDS_MIN_UINT : R600_LDS_1A1D_NORET <0x7, "LDS_MIN_UINT", [] >; def LDS_MAX_UINT : R600_LDS_1A1D_NORET <0x8, "LDS_MAX_UINT", [] >; def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE", - [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)] + [(store_local (i32 R600_Reg32:$src1), R600_Reg32:$src0)] >; def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE", [(truncstorei8_local i32:$src1, i32:$src0)] @@ -653,10 +660,10 @@ def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG", [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))] >; def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST", - [(set i32:$dst, (atomic_cmp_swap_32_local i32:$src0, 
i32:$src1, i32:$src2))] + [(set i32:$dst, (atomic_cmp_swap_local i32:$src0, i32:$src1, i32:$src2))] >; def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET", - [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))] + [(set (i32 R600_Reg32:$dst), (load_local R600_Reg32:$src0))] >; def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET", [(set i32:$dst, (sextloadi8_local i32:$src0))] @@ -681,9 +688,9 @@ def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET", // XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes, // which do not need to be truncated since the fp values are 0.0f or 1.0f. // We should look into handling these cases separately. -def : Pat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>; +def : EGOrCaymanPat<(fp_to_sint f32:$src0), (FLT_TO_INT_eg (TRUNC $src0))>; -def : Pat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; +def : EGOrCaymanPat<(fp_to_uint f32:$src0), (FLT_TO_UINT_eg (TRUNC $src0))>; // SHA-256 Patterns def : SHA256MaPattern <BFI_INT_eg, XOR_INT>; diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index edca6fcd812c..693869128081 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -8,7 +8,10 @@ //===----------------------------------------------------------------------===// def FLATAtomic : ComplexPattern<i64, 3, "SelectFlatAtomic", [], [], -10>; -def FLATOffset : ComplexPattern<i64, 3, "SelectFlat", [], [], -10>; +def FLATOffset : ComplexPattern<i64, 3, "SelectFlatOffset<false>", [], [], -10>; + +def FLATOffsetSigned : ComplexPattern<i64, 3, "SelectFlatOffset<true>", [], [], -10>; +def FLATSignedAtomic : ComplexPattern<i64, 3, "SelectFlatAtomicSigned", [], [], -10>; //===----------------------------------------------------------------------===// // FLAT classes @@ -22,14 +25,7 @@ class FLAT_Pseudo<string opName, dag outs, dag ins, let isPseudo = 1; let isCodeGenOnly = 1; - let SubtargetPredicate = isCIVI; - let FLAT = 1; - // Internally, FLAT instruction are executed as both an LDS and a - // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT - // and are not considered done until both have been decremented. - let VM_CNT = 1; - let LGKM_CNT = 1; let UseNamedOperandTable = 1; let hasSideEffects = 0; @@ -42,12 +38,32 @@ class FLAT_Pseudo<string opName, dag outs, dag ins, bits<1> is_flat_scratch = 0; bits<1> has_vdst = 1; + + // We need to distinguish having saddr and enabling saddr because + // saddr is only valid for scratch and global instructions. Pre-gfx9 + // these bits were reserved, so we also don't necessarily want to + // set these bits to the disabled value for the original flat + // segment instructions. + bits<1> has_saddr = 0; + bits<1> enabled_saddr = 0; + bits<7> saddr_value = 0; + bits<1> has_vaddr = 1; + bits<1> has_data = 1; bits<1> has_glc = 1; bits<1> glcValue = 0; + let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts, + !if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace)); + // TODO: M0 if it could possibly access LDS (before gfx9? only)? let Uses = !if(is_flat_global, [EXEC], [EXEC, FLAT_SCR]); + + // Internally, FLAT instruction are executed as both an LDS and a + // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT + // and are not considered done until both have been decremented. 
+ let VM_CNT = 1; + let LGKM_CNT = !if(!or(is_flat_global, is_flat_scratch), 0, 1); } class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : @@ -66,7 +82,9 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : // encoding fields bits<8> vaddr; bits<8> vdata; + bits<7> saddr; bits<8> vdst; + bits<1> slc; bits<1> glc; @@ -94,56 +112,143 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> : let Inst{17} = slc; let Inst{24-18} = op; let Inst{31-26} = 0x37; // Encoding. - let Inst{39-32} = vaddr; + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); let Inst{47-40} = !if(ps.has_data, vdata, ?); + let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7f), 0); + // 54-48 is reserved. let Inst{55} = nv; // nv on GFX9+, TFE before. let Inst{63-56} = !if(ps.has_vdst, vdst, ?); } +// TODO: Is exec allowed for saddr? The disabled value 0x7f is the +// same encoding value as exec_hi, so it isn't possible to use that if +// saddr is 32-bit (which isn't handled here yet). class FLAT_Load_Pseudo <string opName, RegisterClass regClass, - bit HasSignedOffset = 0> : FLAT_Pseudo< + bit HasTiedOutput = 0, + bit HasSignedOffset = 0, bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo< opName, (outs regClass:$vdst), - !if(HasSignedOffset, - (ins VReg_64:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc), - (ins VReg_64:$vaddr, offset_u12:$offset, GLC:$glc, slc:$slc)), - " $vdst, $vaddr$offset$glc$slc"> { + !con( + !con( + !con( + !con((ins VReg_64:$vaddr), + !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), + (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)), + (ins GLC:$glc, slc:$slc)), + !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))), + " $vdst, $vaddr"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> { let has_data = 0; let mayLoad = 1; + let has_saddr = HasSaddr; + let enabled_saddr = EnableSaddr; + let PseudoInstr = opName#!if(!and(HasSaddr, EnableSaddr), "_SADDR", ""); + let maybeAtomic = 1; + + let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); + let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); +} + +class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, + bit HasSignedOffset = 0, bit HasSaddr = 0, bit EnableSaddr = 0> : FLAT_Pseudo< + opName, + (outs), + !con( + !con( + !con((ins VReg_64:$vaddr, vdataClass:$vdata), + !if(EnableSaddr, (ins SReg_64:$saddr), (ins))), + (ins !if(HasSignedOffset,offset_s13,offset_u12):$offset)), + (ins GLC:$glc, slc:$slc)), + " $vaddr, $vdata"#!if(HasSaddr, !if(EnableSaddr, ", $saddr", ", off"), "")#"$offset$glc$slc"> { + let mayLoad = 0; + let mayStore = 1; + let has_vdst = 0; + let has_saddr = HasSaddr; + let enabled_saddr = EnableSaddr; + let PseudoInstr = opName#!if(!and(HasSaddr, EnableSaddr), "_SADDR", ""); + let maybeAtomic = 1; } -class FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass> : - FLAT_Load_Pseudo<opName, regClass, 1> { - let is_flat_global = 1; +multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { + let is_flat_global = 1 in { + def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>; + def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1, 1>; + } } -class FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass> : - FLAT_Load_Pseudo<opName, regClass, 1> { - let is_flat_scratch = 1; +multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> { + let is_flat_global = 1 in { + def "" : FLAT_Store_Pseudo<opName, regClass, 1, 1>; + def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1, 1>; + } } -class 
FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, - bit HasSignedOffset = 0> : FLAT_Pseudo< +class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass, + bit EnableSaddr = 0>: FLAT_Pseudo< + opName, + (outs regClass:$vdst), + !if(EnableSaddr, + (ins SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, slc:$slc), + (ins VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc)), + " $vdst, "#!if(EnableSaddr, "off", "$vaddr")#!if(EnableSaddr, ", $saddr", ", off")#"$offset$glc$slc"> { + let has_data = 0; + let mayLoad = 1; + let has_saddr = 1; + let enabled_saddr = EnableSaddr; + let has_vaddr = !if(EnableSaddr, 0, 1); + let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", ""); + let maybeAtomic = 1; +} + +class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit EnableSaddr = 0> : FLAT_Pseudo< opName, (outs), - !if(HasSignedOffset, - (ins VReg_64:$vaddr, vdataClass:$vdata, offset_s13:$offset, GLC:$glc, slc:$slc), - (ins VReg_64:$vaddr, vdataClass:$vdata, offset_u12:$offset, GLC:$glc, slc:$slc)), - " $vaddr, $vdata$offset$glc$slc"> { + !if(EnableSaddr, + (ins vdataClass:$vdata, SReg_32_XEXEC_HI:$saddr, offset_s13:$offset, GLC:$glc, slc:$slc), + (ins vdataClass:$vdata, VGPR_32:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc)), + " "#!if(EnableSaddr, "off", "$vaddr")#", $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc"> { let mayLoad = 0; let mayStore = 1; let has_vdst = 0; + let has_saddr = 1; + let enabled_saddr = EnableSaddr; + let has_vaddr = !if(EnableSaddr, 0, 1); + let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", ""); + let maybeAtomic = 1; } -class FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> : - FLAT_Store_Pseudo<opName, regClass, 1> { - let is_flat_global = 1; +multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass> { + let is_flat_scratch = 1 in { + def "" : FLAT_Scratch_Load_Pseudo<opName, regClass>; + def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, 1>; + } } -class FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> : - FLAT_Store_Pseudo<opName, regClass, 1> { - let is_flat_scratch = 1; +multiclass FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> { + let is_flat_scratch = 1 in { + def "" : FLAT_Scratch_Store_Pseudo<opName, regClass>; + def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>; + } +} + +class FLAT_AtomicNoRet_Pseudo<string opName, dag outs, dag ins, + string asm, list<dag> pattern = []> : + FLAT_Pseudo<opName, outs, ins, asm, pattern> { + let mayLoad = 1; + let mayStore = 1; + let has_glc = 0; + let glcValue = 0; + let has_vdst = 0; + let maybeAtomic = 1; +} + +class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins, + string asm, list<dag> pattern = []> + : FLAT_AtomicNoRet_Pseudo<opName, outs, ins, asm, pattern> { + let hasPostISelHook = 1; + let has_vdst = 1; + let glcValue = 1; + let PseudoInstr = NAME # "_RTN"; } multiclass FLAT_Atomic_Pseudo< @@ -152,40 +257,69 @@ multiclass FLAT_Atomic_Pseudo< ValueType vt, SDPatternOperator atomic = null_frag, ValueType data_vt = vt, - RegisterClass data_rc = vdst_rc, - bit HasSignedOffset = 0> { - - def "" : FLAT_Pseudo <opName, + RegisterClass data_rc = vdst_rc> { + def "" : FLAT_AtomicNoRet_Pseudo <opName, (outs), - !if(HasSignedOffset, - (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc), - (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc)), - " $vaddr, $vdata$offset$slc", - []>, - AtomicNoRet <NAME, 0> { - let mayLoad = 1; - let mayStore = 1; - let 
has_glc = 0; - let glcValue = 0; - let has_vdst = 0; + (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc), + " $vaddr, $vdata$offset$slc">, + AtomicNoRet <opName, 0> { let PseudoInstr = NAME; } - def _RTN : FLAT_Pseudo <opName, + def _RTN : FLAT_AtomicRet_Pseudo <opName, (outs vdst_rc:$vdst), - !if(HasSignedOffset, - (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc), - (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc)), + (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc), " $vdst, $vaddr, $vdata$offset glc$slc", [(set vt:$vdst, (atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, - AtomicNoRet <NAME, 1> { - let mayLoad = 1; - let mayStore = 1; - let hasPostISelHook = 1; - let has_glc = 0; - let glcValue = 1; - let PseudoInstr = NAME # "_RTN"; + AtomicNoRet <opName, 1>; +} + +multiclass FLAT_Global_Atomic_Pseudo< + string opName, + RegisterClass vdst_rc, + ValueType vt, + SDPatternOperator atomic = null_frag, + ValueType data_vt = vt, + RegisterClass data_rc = vdst_rc> { + + def "" : FLAT_AtomicNoRet_Pseudo <opName, + (outs), + (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc), + " $vaddr, $vdata, off$offset$slc">, + AtomicNoRet <opName, 0> { + let has_saddr = 1; + let PseudoInstr = NAME; + } + + def _RTN : FLAT_AtomicRet_Pseudo <opName, + (outs vdst_rc:$vdst), + (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc), + " $vdst, $vaddr, $vdata, off$offset glc$slc", + [(set vt:$vdst, + (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>, + AtomicNoRet <opName, 1> { + let has_saddr = 1; + } + + def _SADDR : FLAT_AtomicNoRet_Pseudo <opName, + (outs), + (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, slc:$slc), + " $vaddr, $vdata, $saddr$offset$slc">, + AtomicNoRet <opName#"_saddr", 0> { + let has_saddr = 1; + let enabled_saddr = 1; + let PseudoInstr = NAME#"_SADDR"; + } + + def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName, + (outs vdst_rc:$vdst), + (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, slc:$slc), + " $vdst, $vaddr, $vdata, $saddr$offset glc$slc">, + AtomicNoRet <opName#"_saddr", 1> { + let has_saddr = 1; + let enabled_saddr = 1; + let PseudoInstr = NAME#"_SADDR_RTN"; } } @@ -231,6 +365,18 @@ def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>; def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>; def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>; +let SubtargetPredicate = HasD16LoadStore in { +def FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo <"flat_load_ubyte_d16", VGPR_32, 1>; +def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>; +def FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo <"flat_load_sbyte_d16", VGPR_32, 1>; +def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; +def FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo <"flat_load_short_d16", VGPR_32, 1>; +def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; + +def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; +def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; +} + defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap", VGPR_32, i32, atomic_cmp_swap_flat, v2i32, VReg_64>; @@ -334,108 +480,274 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2", } // End SubtargetPredicate = isCI let 
SubtargetPredicate = HasFlatGlobalInsts in { -def GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; -def GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>; -def GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>; -def GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>; -def GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>; -def GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>; -def GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>; -def GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>; - -def GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>; -def GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>; -def GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>; -def GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>; -def GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>; -def GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>; +defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; +defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>; +defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>; +defm GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>; +defm GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>; +defm GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>; +defm GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>; +defm GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>; + +defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16", VGPR_32, 1>; +defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16_hi", VGPR_32, 1>; +defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16", VGPR_32, 1>; +defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32, 1>; +defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo <"global_load_short_d16", VGPR_32, 1>; +defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32, 1>; + +defm GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>; +defm GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>; +defm GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>; +defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>; +defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>; +defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>; + +defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>; +defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>; + +let is_flat_global = 1 in { +defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap", + VGPR_32, i32, AMDGPUatomic_cmp_swap_global, + v2i32, VReg_64>; + +defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap_x2", + VReg_64, i64, AMDGPUatomic_cmp_swap_global, + v2i64, VReg_128>; + +defm 
GLOBAL_ATOMIC_SWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_swap", + VGPR_32, i32, atomic_swap_global>; + +defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_swap_x2", + VReg_64, i64, atomic_swap_global>; + +defm GLOBAL_ATOMIC_ADD : FLAT_Global_Atomic_Pseudo <"global_atomic_add", + VGPR_32, i32, atomic_add_global>; + +defm GLOBAL_ATOMIC_SUB : FLAT_Global_Atomic_Pseudo <"global_atomic_sub", + VGPR_32, i32, atomic_sub_global>; + +defm GLOBAL_ATOMIC_SMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_smin", + VGPR_32, i32, atomic_min_global>; + +defm GLOBAL_ATOMIC_UMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_umin", + VGPR_32, i32, atomic_umin_global>; + +defm GLOBAL_ATOMIC_SMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_smax", + VGPR_32, i32, atomic_max_global>; + +defm GLOBAL_ATOMIC_UMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_umax", + VGPR_32, i32, atomic_umax_global>; + +defm GLOBAL_ATOMIC_AND : FLAT_Global_Atomic_Pseudo <"global_atomic_and", + VGPR_32, i32, atomic_and_global>; + +defm GLOBAL_ATOMIC_OR : FLAT_Global_Atomic_Pseudo <"global_atomic_or", + VGPR_32, i32, atomic_or_global>; + +defm GLOBAL_ATOMIC_XOR : FLAT_Global_Atomic_Pseudo <"global_atomic_xor", + VGPR_32, i32, atomic_xor_global>; + +defm GLOBAL_ATOMIC_INC : FLAT_Global_Atomic_Pseudo <"global_atomic_inc", + VGPR_32, i32, atomic_inc_global>; + +defm GLOBAL_ATOMIC_DEC : FLAT_Global_Atomic_Pseudo <"global_atomic_dec", + VGPR_32, i32, atomic_dec_global>; + +defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_add_x2", + VReg_64, i64, atomic_add_global>; + +defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_sub_x2", + VReg_64, i64, atomic_sub_global>; + +defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smin_x2", + VReg_64, i64, atomic_min_global>; + +defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umin_x2", + VReg_64, i64, atomic_umin_global>; + +defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smax_x2", + VReg_64, i64, atomic_max_global>; + +defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umax_x2", + VReg_64, i64, atomic_umax_global>; + +defm GLOBAL_ATOMIC_AND_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_and_x2", + VReg_64, i64, atomic_and_global>; + +defm GLOBAL_ATOMIC_OR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_or_x2", + VReg_64, i64, atomic_or_global>; + +defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_xor_x2", + VReg_64, i64, atomic_xor_global>; + +defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2", + VReg_64, i64, atomic_inc_global>; + +defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2", + VReg_64, i64, atomic_dec_global>; +} // End is_flat_global = 1 } // End SubtargetPredicate = HasFlatGlobalInsts +let SubtargetPredicate = HasFlatScratchInsts in { +defm SCRATCH_LOAD_UBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte", VGPR_32>; +defm SCRATCH_LOAD_SBYTE : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte", VGPR_32>; +defm SCRATCH_LOAD_USHORT : FLAT_Scratch_Load_Pseudo <"scratch_load_ushort", VGPR_32>; +defm SCRATCH_LOAD_SSHORT : FLAT_Scratch_Load_Pseudo <"scratch_load_sshort", VGPR_32>; +defm SCRATCH_LOAD_DWORD : FLAT_Scratch_Load_Pseudo <"scratch_load_dword", VGPR_32>; +defm SCRATCH_LOAD_DWORDX2 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx2", VReg_64>; +defm SCRATCH_LOAD_DWORDX3 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx3", VReg_96>; +defm SCRATCH_LOAD_DWORDX4 : 
FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx4", VReg_128>; + +defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16", VGPR_32>; +defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16_hi", VGPR_32>; +defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16", VGPR_32>; +defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16_hi", VGPR_32>; +defm SCRATCH_LOAD_SHORT_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16", VGPR_32>; +defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16_hi", VGPR_32>; + +defm SCRATCH_STORE_BYTE : FLAT_Scratch_Store_Pseudo <"scratch_store_byte", VGPR_32>; +defm SCRATCH_STORE_SHORT : FLAT_Scratch_Store_Pseudo <"scratch_store_short", VGPR_32>; +defm SCRATCH_STORE_DWORD : FLAT_Scratch_Store_Pseudo <"scratch_store_dword", VGPR_32>; +defm SCRATCH_STORE_DWORDX2 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx2", VReg_64>; +defm SCRATCH_STORE_DWORDX3 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx3", VReg_96>; +defm SCRATCH_STORE_DWORDX4 : FLAT_Scratch_Store_Pseudo <"scratch_store_dwordx4", VReg_128>; + +defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_byte_d16_hi", VGPR_32>; +defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Scratch_Store_Pseudo <"scratch_store_short_d16_hi", VGPR_32>; + +} // End SubtargetPredicate = HasFlatScratchInsts + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// -class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr), - (ld node:$ptr), [{ - auto const AS = cast<MemSDNode>(N)->getAddressSpace(); - return AS == AMDGPUASI.FLAT_ADDRESS || - AS == AMDGPUASI.GLOBAL_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS; -}]>; - -class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ - auto const AS = cast<MemSDNode>(N)->getAddressSpace(); - return AS == AMDGPUASI.FLAT_ADDRESS || - AS == AMDGPUASI.GLOBAL_ADDRESS; -}]>; - -def atomic_flat_load : flat_ld <atomic_load>; -def flat_load : flat_ld <load>; -def flat_az_extloadi8 : flat_ld <az_extloadi8>; -def flat_sextloadi8 : flat_ld <sextloadi8>; -def flat_az_extloadi16 : flat_ld <az_extloadi16>; -def flat_sextloadi16 : flat_ld <sextloadi16>; - -def atomic_flat_store : flat_st <atomic_store>; -def flat_store : flat_st <store>; -def flat_truncstorei8 : flat_st <truncstorei8>; -def flat_truncstorei16 : flat_st <truncstorei16>; - // Patterns for global loads with no offset. 
-class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < - (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))), +class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))), (inst $vaddr, $offset, 0, $slc) >; -class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < +multiclass FlatLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> { + def : GCNPat < + (build_vector vt:$elt0, (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))), + (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0)) + >; + + def : GCNPat < + (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)))))), + (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0)) + >; +} + +multiclass FlatSignedLoadPat_Hi16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> { + def : GCNPat < + (build_vector vt:$elt0, (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))), + (v2i16 (inst $vaddr, $offset, 0, $slc, $elt0)) + >; + + def : GCNPat < + (build_vector f16:$elt0, (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)))))), + (v2f16 (inst $vaddr, $offset, 0, $slc, $elt0)) + >; +} + +multiclass FlatLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> { + def : GCNPat < + (build_vector (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))), + (v2i16 (inst $vaddr, $offset, 0, $slc, $hi)) + >; + + def : GCNPat < + (build_vector (f16 (bitconvert (vt (node (FLATOffset i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))), + (v2f16 (inst $vaddr, $offset, 0, $slc, $hi)) + >; +} + +multiclass FlatSignedLoadPat_Lo16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt = i16> { + def : GCNPat < + (build_vector (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))), (vt (Hi16Elt vt:$hi))), + (v2i16 (inst $vaddr, $offset, 0, $slc, $hi)) + >; + + def : GCNPat < + (build_vector (f16 (bitconvert (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))))), (f16 (Hi16Elt f16:$hi))), + (v2f16 (inst $vaddr, $offset, 0, $slc, $hi)) + >; +} + +class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))), - (inst $vaddr, $offset, 1, $slc) + (inst $vaddr, $offset, 0, $slc) >; -class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < - (node vt:$data, (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc)), +class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc))), + (inst $vaddr, $offset, 0, $slc) +>; + +class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (node vt:$data, (FLATOffset i64:$vaddr, i16:$offset, i1:$slc)), (inst $vaddr, $data, $offset, 0, $slc) >; -class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat < +class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (node vt:$data, (FLATOffsetSigned i64:$vaddr, i16:$offset, i1:$slc)), + (inst $vaddr, $data, $offset, 0, $slc) +>; + +class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < // atomic store follows atomic binop convention so the address comes // first. 
(node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data), - (inst $vaddr, $data, $offset, 1, $slc) + (inst $vaddr, $data, $offset, 0, $slc) +>; + +class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + // atomic store follows atomic binop convention so the address comes + // first. + (node (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data), + (inst $vaddr, $data, $offset, 0, $slc) >; class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, - ValueType data_vt = vt> : Pat < + ValueType data_vt = vt> : GCNPat < (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$data)), (inst $vaddr, $data, $offset, $slc) >; -let Predicates = [isCIVI] in { +class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt, + ValueType data_vt = vt> : GCNPat < + (vt (node (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$data)), + (inst $vaddr, $data, $offset, $slc) +>; + +let OtherPredicates = [HasFlatAddressSpace] in { -def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>; -def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i16>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i16>; -def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>; -def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>; +def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_flat, i32>; +def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>; +def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_flat, i16>; +def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>; +def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_flat, i32>; +def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>; +def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, v2i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, v4i32>; -def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_flat_load, i32>; -def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_flat_load, i64>; +def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_load_flat, i32>; +def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_load_flat, i64>; -def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>; -def : FlatStorePat <FLAT_STORE_SHORT, flat_truncstorei16, i32>; -def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>; -def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>; -def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>; +def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>; +def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>; +def : FlatStorePat <FLAT_STORE_DWORD, store_flat, i32>; +def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, v2i32>; +def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, v4i32>; -def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_flat_store, i32>; -def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_flat_store, i64>; +def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat, i32>; +def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat, i64>; def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; @@ -465,13 +777,100 @@ def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, 
atomic_swap_global, i64>; def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>; def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>; -} // End Predicates = [isCIVI] +def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>; +def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>; -let Predicates = [isVI] in { - def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i16>; - def : FlatStorePat <FLAT_STORE_SHORT, flat_store, i16>; +let OtherPredicates = [HasD16LoadStore] in { +def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; +def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; + +let AddedComplexity = 3 in { +defm : FlatLoadPat_Hi16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_flat>; +defm : FlatLoadPat_Hi16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_flat>; +defm : FlatLoadPat_Hi16 <FLAT_LOAD_SHORT_D16_HI, load_flat>; } +let AddedComplexity = 9 in { +defm : FlatLoadPat_Lo16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_flat>; +defm : FlatLoadPat_Lo16 <FLAT_LOAD_SBYTE_D16, sextloadi8_flat>; +defm : FlatLoadPat_Lo16 <FLAT_LOAD_SHORT_D16, load_flat>; +} +} + +} // End OtherPredicates = [HasFlatAddressSpace] + +let OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 in { + +def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, az_extloadi8_global, i32>; +def : FlatLoadSignedPat <GLOBAL_LOAD_SBYTE, sextloadi8_global, i32>; +def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, az_extloadi8_global, i16>; +def : FlatLoadSignedPat <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>; +def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, az_extloadi16_global, i32>; +def : FlatLoadSignedPat <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>; +def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, load_global, i16>; + +def : FlatLoadSignedPat <GLOBAL_LOAD_DWORD, load_global, i32>; +def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX2, load_global, v2i32>; +def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX4, load_global, v4i32>; + +def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORD, atomic_load_global, i32>; +def : FlatLoadAtomicPat <GLOBAL_LOAD_DWORDX2, atomic_load_global, i64>; + +def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i32>; +def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i16>; +def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, truncstorei16_global, i32>; +def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, store_global, i16>; +def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32>; +def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32>; +def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32>; + +let OtherPredicates = [HasD16LoadStore] in { +def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>; +def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>; + +defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_UBYTE_D16_HI, az_extloadi8_global>; +defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SBYTE_D16_HI, sextloadi8_global>; +defm : FlatSignedLoadPat_Hi16 <GLOBAL_LOAD_SHORT_D16_HI, load_global>; + +defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_UBYTE_D16, az_extloadi8_global>; +defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SBYTE_D16, sextloadi8_global>; +defm : FlatSignedLoadPat_Lo16 <GLOBAL_LOAD_SHORT_D16, load_global>; + +} + +def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>; +def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORDX2, store_atomic_global, i64>; + +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_RTN, atomic_add_global, i32>; +def : 
FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_RTN, atomic_sub_global, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_RTN, atomic_inc_global, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_RTN, atomic_dec_global, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_RTN, atomic_and_global, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_RTN, atomic_max_global, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_RTN, atomic_min_global, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_RTN, atomic_or_global, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global, i32, v2i32>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_RTN, atomic_xor_global, i32>; + +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_X2_RTN, atomic_and_global, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_X2_RTN, atomic_or_global, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>; +def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>; + +} // End OtherPredicates = [HasFlatGlobalInsts] + //===----------------------------------------------------------------------===// // Target @@ -556,6 +955,11 @@ class FLAT_Real_vi <bits<7> op, FLAT_Pseudo ps> : let DecoderNamespace="VI"; } +multiclass FLAT_Real_AllAddr_vi<bits<7> op> { + def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME)>; + def _SADDR_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(NAME#"_SADDR")>; +} + def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>; def FLAT_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>; def FLAT_LOAD_USHORT_vi : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>; @@ -566,17 +970,33 @@ def FLAT_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, FLAT_LOAD_DWORDX4>; def FLAT_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, FLAT_LOAD_DWORDX3>; def FLAT_STORE_BYTE_vi : FLAT_Real_vi <0x18, FLAT_STORE_BYTE>; +def FLAT_STORE_BYTE_D16_HI_vi : FLAT_Real_vi <0x19, FLAT_STORE_BYTE_D16_HI>; def FLAT_STORE_SHORT_vi : FLAT_Real_vi <0x1a, FLAT_STORE_SHORT>; +def FLAT_STORE_SHORT_D16_HI_vi : FLAT_Real_vi <0x1b, FLAT_STORE_SHORT_D16_HI>; def FLAT_STORE_DWORD_vi : FLAT_Real_vi <0x1c, FLAT_STORE_DWORD>; def FLAT_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, FLAT_STORE_DWORDX2>; def FLAT_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, FLAT_STORE_DWORDX4>; def FLAT_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, FLAT_STORE_DWORDX3>; +def FLAT_LOAD_UBYTE_D16_vi : FLAT_Real_vi <0x20, FLAT_LOAD_UBYTE_D16>; +def FLAT_LOAD_UBYTE_D16_HI_vi : FLAT_Real_vi <0x21, FLAT_LOAD_UBYTE_D16_HI>; +def FLAT_LOAD_SBYTE_D16_vi : FLAT_Real_vi <0x22, FLAT_LOAD_SBYTE_D16>; +def 
FLAT_LOAD_SBYTE_D16_HI_vi : FLAT_Real_vi <0x23, FLAT_LOAD_SBYTE_D16_HI>; +def FLAT_LOAD_SHORT_D16_vi : FLAT_Real_vi <0x24, FLAT_LOAD_SHORT_D16>; +def FLAT_LOAD_SHORT_D16_HI_vi : FLAT_Real_vi <0x25, FLAT_LOAD_SHORT_D16_HI>; + multiclass FLAT_Real_Atomics_vi <bits<7> op, FLAT_Pseudo ps> { def _vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr)>; def _RTN_vi : FLAT_Real_vi<op, !cast<FLAT_Pseudo>(ps.PseudoInstr # "_RTN")>; } +multiclass FLAT_Global_Real_Atomics_vi<bits<7> op> : + FLAT_Real_AllAddr_vi<op> { + def _RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_RTN")>; + def _SADDR_RTN_vi : FLAT_Real_vi <op, !cast<FLAT_Pseudo>(NAME#"_SADDR_RTN")>; +} + + defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_vi <0x40, FLAT_ATOMIC_SWAP>; defm FLAT_ATOMIC_CMPSWAP : FLAT_Real_Atomics_vi <0x41, FLAT_ATOMIC_CMPSWAP>; defm FLAT_ATOMIC_ADD : FLAT_Real_Atomics_vi <0x42, FLAT_ATOMIC_ADD>; @@ -604,18 +1024,78 @@ defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_vi <0x6a, FLAT_ATOMIC_XOR_X2>; defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>; defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>; -def GLOBAL_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, GLOBAL_LOAD_UBYTE>; -def GLOBAL_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, GLOBAL_LOAD_SBYTE>; -def GLOBAL_LOAD_USHORT_vi : FLAT_Real_vi <0x12, GLOBAL_LOAD_USHORT>; -def GLOBAL_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, GLOBAL_LOAD_SSHORT>; -def GLOBAL_LOAD_DWORD_vi : FLAT_Real_vi <0x14, GLOBAL_LOAD_DWORD>; -def GLOBAL_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, GLOBAL_LOAD_DWORDX2>; -def GLOBAL_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, GLOBAL_LOAD_DWORDX4>; -def GLOBAL_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, GLOBAL_LOAD_DWORDX3>; - -def GLOBAL_STORE_BYTE_vi : FLAT_Real_vi <0x18, GLOBAL_STORE_BYTE>; -def GLOBAL_STORE_SHORT_vi : FLAT_Real_vi <0x1a, GLOBAL_STORE_SHORT>; -def GLOBAL_STORE_DWORD_vi : FLAT_Real_vi <0x1c, GLOBAL_STORE_DWORD>; -def GLOBAL_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, GLOBAL_STORE_DWORDX2>; -def GLOBAL_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, GLOBAL_STORE_DWORDX4>; -def GLOBAL_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, GLOBAL_STORE_DWORDX3>; +defm GLOBAL_LOAD_UBYTE : FLAT_Real_AllAddr_vi <0x10>; +defm GLOBAL_LOAD_SBYTE : FLAT_Real_AllAddr_vi <0x11>; +defm GLOBAL_LOAD_USHORT : FLAT_Real_AllAddr_vi <0x12>; +defm GLOBAL_LOAD_SSHORT : FLAT_Real_AllAddr_vi <0x13>; +defm GLOBAL_LOAD_DWORD : FLAT_Real_AllAddr_vi <0x14>; +defm GLOBAL_LOAD_DWORDX2 : FLAT_Real_AllAddr_vi <0x15>; +defm GLOBAL_LOAD_DWORDX3 : FLAT_Real_AllAddr_vi <0x16>; +defm GLOBAL_LOAD_DWORDX4 : FLAT_Real_AllAddr_vi <0x17>; + +defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_vi <0x20>; +defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_vi <0x21>; +defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_vi <0x22>; +defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_vi <0x23>; +defm GLOBAL_LOAD_SHORT_D16 : FLAT_Real_AllAddr_vi <0x24>; +defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_vi <0x25>; + +defm GLOBAL_STORE_BYTE : FLAT_Real_AllAddr_vi <0x18>; +defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_vi <0x19>; +defm GLOBAL_STORE_SHORT : FLAT_Real_AllAddr_vi <0x1a>; +defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_vi <0x1b>; +defm GLOBAL_STORE_DWORD : FLAT_Real_AllAddr_vi <0x1c>; +defm GLOBAL_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>; +defm GLOBAL_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>; +defm GLOBAL_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>; + + +defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Real_Atomics_vi <0x40>; +defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Real_Atomics_vi <0x41>; +defm 
GLOBAL_ATOMIC_ADD : FLAT_Global_Real_Atomics_vi <0x42>; +defm GLOBAL_ATOMIC_SUB : FLAT_Global_Real_Atomics_vi <0x43>; +defm GLOBAL_ATOMIC_SMIN : FLAT_Global_Real_Atomics_vi <0x44>; +defm GLOBAL_ATOMIC_UMIN : FLAT_Global_Real_Atomics_vi <0x45>; +defm GLOBAL_ATOMIC_SMAX : FLAT_Global_Real_Atomics_vi <0x46>; +defm GLOBAL_ATOMIC_UMAX : FLAT_Global_Real_Atomics_vi <0x47>; +defm GLOBAL_ATOMIC_AND : FLAT_Global_Real_Atomics_vi <0x48>; +defm GLOBAL_ATOMIC_OR : FLAT_Global_Real_Atomics_vi <0x49>; +defm GLOBAL_ATOMIC_XOR : FLAT_Global_Real_Atomics_vi <0x4a>; +defm GLOBAL_ATOMIC_INC : FLAT_Global_Real_Atomics_vi <0x4b>; +defm GLOBAL_ATOMIC_DEC : FLAT_Global_Real_Atomics_vi <0x4c>; +defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Global_Real_Atomics_vi <0x60>; +defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Real_Atomics_vi <0x61>; +defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Global_Real_Atomics_vi <0x62>; +defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Global_Real_Atomics_vi <0x63>; +defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Global_Real_Atomics_vi <0x64>; +defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Global_Real_Atomics_vi <0x65>; +defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Global_Real_Atomics_vi <0x66>; +defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Global_Real_Atomics_vi <0x67>; +defm GLOBAL_ATOMIC_AND_X2 : FLAT_Global_Real_Atomics_vi <0x68>; +defm GLOBAL_ATOMIC_OR_X2 : FLAT_Global_Real_Atomics_vi <0x69>; +defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Real_Atomics_vi <0x6a>; +defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Real_Atomics_vi <0x6b>; +defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Real_Atomics_vi <0x6c>; + +defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_vi <0x10>; +defm SCRATCH_LOAD_SBYTE : FLAT_Real_AllAddr_vi <0x11>; +defm SCRATCH_LOAD_USHORT : FLAT_Real_AllAddr_vi <0x12>; +defm SCRATCH_LOAD_SSHORT : FLAT_Real_AllAddr_vi <0x13>; +defm SCRATCH_LOAD_DWORD : FLAT_Real_AllAddr_vi <0x14>; +defm SCRATCH_LOAD_DWORDX2 : FLAT_Real_AllAddr_vi <0x15>; +defm SCRATCH_LOAD_DWORDX3 : FLAT_Real_AllAddr_vi <0x16>; +defm SCRATCH_LOAD_DWORDX4 : FLAT_Real_AllAddr_vi <0x17>; +defm SCRATCH_STORE_BYTE : FLAT_Real_AllAddr_vi <0x18>; +defm SCRATCH_STORE_BYTE_D16_HI : FLAT_Real_AllAddr_vi <0x19>; +defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Real_AllAddr_vi <0x20>; +defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Real_AllAddr_vi <0x21>; +defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Real_AllAddr_vi <0x22>; +defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Real_AllAddr_vi <0x23>; +defm SCRATCH_LOAD_SHORT_D16 : FLAT_Real_AllAddr_vi <0x24>; +defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Real_AllAddr_vi <0x25>; +defm SCRATCH_STORE_SHORT : FLAT_Real_AllAddr_vi <0x1a>; +defm SCRATCH_STORE_SHORT_D16_HI : FLAT_Real_AllAddr_vi <0x1b>; +defm SCRATCH_STORE_DWORD : FLAT_Real_AllAddr_vi <0x1c>; +defm SCRATCH_STORE_DWORDX2 : FLAT_Real_AllAddr_vi <0x1d>; +defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>; +defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>; diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 025397b1eac0..dd515b0bf2f1 100644 --- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -40,7 +40,10 @@ GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : CurrCycleInstr(nullptr), MF(MF), ST(MF.getSubtarget<SISubtarget>()), - TII(*ST.getInstrInfo()) { + TII(*ST.getInstrInfo()), + TRI(TII.getRegisterInfo()), + ClauseUses(TRI.getNumRegUnits()), + ClauseDefs(TRI.getNumRegUnits()) { MaxLookAhead = 5; } @@ -84,6 +87,18 @@ static bool isSMovRel(unsigned Opcode) { } } +static bool isSendMsgTraceDataOrGDS(const MachineInstr &MI) { + switch 
(MI.getOpcode()) { + case AMDGPU::S_SENDMSG: + case AMDGPU::S_SENDMSGHALT: + case AMDGPU::S_TTRACEDATA: + return true; + default: + // TODO: GDS + return false; + } +} + static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) { const MachineOperand *RegOp = TII->getNamedOperand(RegInstr, AMDGPU::OpName::simm16); @@ -97,7 +112,10 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) return NoopHazard; - if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0) + // FIXME: Should flat be considered vmem? + if ((SIInstrInfo::isVMEM(*MI) || + SIInstrInfo::isFLAT(*MI)) + && checkVMEMHazards(MI) > 0) return NoopHazard; if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) @@ -121,10 +139,18 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) return NoopHazard; - if ((TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) && + if (ST.hasReadM0MovRelInterpHazard() && + (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) && checkReadM0Hazards(MI) > 0) return NoopHazard; + if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI) && + checkReadM0Hazards(MI) > 0) + return NoopHazard; + + if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0) + return NoopHazard; + if (checkAnyInstHazards(MI) > 0) return NoopHazard; @@ -141,26 +167,23 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { if (SIInstrInfo::isSMRD(*MI)) return std::max(WaitStates, checkSMRDHazards(MI)); - if (SIInstrInfo::isVALU(*MI)) { - WaitStates = std::max(WaitStates, checkVALUHazards(MI)); - - if (SIInstrInfo::isVMEM(*MI)) - WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); + if (SIInstrInfo::isVALU(*MI)) + WaitStates = std::max(WaitStates, checkVALUHazards(MI)); - if (SIInstrInfo::isDPP(*MI)) - WaitStates = std::max(WaitStates, checkDPPHazards(MI)); + if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) + WaitStates = std::max(WaitStates, checkVMEMHazards(MI)); - if (isDivFMas(MI->getOpcode())) - WaitStates = std::max(WaitStates, checkDivFMasHazards(MI)); + if (SIInstrInfo::isDPP(*MI)) + WaitStates = std::max(WaitStates, checkDPPHazards(MI)); - if (isRWLane(MI->getOpcode())) - WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); + if (isDivFMas(MI->getOpcode())) + WaitStates = std::max(WaitStates, checkDivFMasHazards(MI)); - if (TII.isVINTRP(*MI)) - WaitStates = std::max(WaitStates, checkReadM0Hazards(MI)); + if (isRWLane(MI->getOpcode())) + WaitStates = std::max(WaitStates, checkRWLaneHazards(MI)); - return WaitStates; - } + if (MI->isInlineAsm()) + return std::max(WaitStates, checkInlineAsmHazards(MI)); if (isSGetReg(MI->getOpcode())) return std::max(WaitStates, checkGetRegHazards(MI)); @@ -171,7 +194,11 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { if (isRFE(MI->getOpcode())) return std::max(WaitStates, checkRFEHazards(MI)); - if (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) + if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) || + isSMovRel(MI->getOpcode()))) + return std::max(WaitStates, checkReadM0Hazards(MI)); + + if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI)) return std::max(WaitStates, checkReadM0Hazards(MI)); return WaitStates; @@ -225,7 +252,8 @@ int GCNHazardRecognizer::getWaitStatesSince( return WaitStates; unsigned Opcode = MI->getOpcode(); - if (Opcode == AMDGPU::DBG_VALUE || Opcode == AMDGPU::IMPLICIT_DEF) + if (Opcode == AMDGPU::DBG_VALUE || Opcode == AMDGPU::IMPLICIT_DEF || + 
Opcode == AMDGPU::INLINEASM) continue; } ++WaitStates; @@ -257,19 +285,37 @@ int GCNHazardRecognizer::getWaitStatesSinceSetReg( // No-op Hazard Detection //===----------------------------------------------------------------------===// -static void addRegsToSet(iterator_range<MachineInstr::const_mop_iterator> Ops, - std::set<unsigned> &Set) { +static void addRegUnits(const SIRegisterInfo &TRI, + BitVector &BV, unsigned Reg) { + for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) + BV.set(*RUI); +} + +static void addRegsToSet(const SIRegisterInfo &TRI, + iterator_range<MachineInstr::const_mop_iterator> Ops, + BitVector &Set) { for (const MachineOperand &Op : Ops) { if (Op.isReg()) - Set.insert(Op.getReg()); + addRegUnits(TRI, Set, Op.getReg()); } } -int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) { - // SMEM soft clause are only present on VI+ - if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) +void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) { + // XXX: Do we need to worry about implicit operands + addRegsToSet(TRI, MI.defs(), ClauseDefs); + addRegsToSet(TRI, MI.uses(), ClauseUses); +} + +int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) { + // SMEM soft clause are only present on VI+, and only matter if xnack is + // enabled. + if (!ST.isXNACKEnabled()) return 0; + bool IsSMRD = TII.isSMRD(*MEM); + + resetClause(); + // A soft-clause is any group of consecutive SMEM instructions. The // instructions in this group may return out of order and/or may be // replayed (i.e. the same instruction issued more than once). @@ -280,51 +326,39 @@ int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) { // (including itself). If we encounter this situaion, we need to break the // clause by inserting a non SMEM instruction. - std::set<unsigned> ClauseDefs; - std::set<unsigned> ClauseUses; - for (MachineInstr *MI : EmittedInstrs) { - // When we hit a non-SMEM instruction then we have passed the start of the // clause and we can stop. - if (!MI || !SIInstrInfo::isSMRD(*MI)) + if (!MI) break; - addRegsToSet(MI->defs(), ClauseDefs); - addRegsToSet(MI->uses(), ClauseUses); + if (IsSMRD != SIInstrInfo::isSMRD(*MI)) + break; + + addClauseInst(*MI); } - if (ClauseDefs.empty()) + if (ClauseDefs.none()) return 0; - // FIXME: When we support stores, we need to make sure not to put loads and - // stores in the same clause if they use the same address. For now, just - // start a new clause whenever we see a store. - if (SMEM->mayStore()) + // We need to make sure not to put loads and stores in the same clause if they + // use the same address. For now, just start a new clause whenever we see a + // store. + if (MEM->mayStore()) return 1; - addRegsToSet(SMEM->defs(), ClauseDefs); - addRegsToSet(SMEM->uses(), ClauseUses); - - std::vector<unsigned> Result(std::max(ClauseDefs.size(), ClauseUses.size())); - std::vector<unsigned>::iterator End; - - End = std::set_intersection(ClauseDefs.begin(), ClauseDefs.end(), - ClauseUses.begin(), ClauseUses.end(), Result.begin()); + addClauseInst(*MEM); // If the set of defs and uses intersect then we cannot add this instruction // to the clause, so we have a hazard. - if (End != Result.begin()) - return 1; - - return 0; + return ClauseDefs.anyCommon(ClauseUses) ? 
1 : 0; } int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); int WaitStatesNeeded = 0; - WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD); + WaitStatesNeeded = checkSoftClauseHazards(SMRD); // This SMRD hazard only affects SI. if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS) @@ -334,6 +368,9 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { // SGPR was written by a VALU instruction. int SmrdSgprWaitStates = 4; auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; + auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); }; + + bool IsBufferSMRD = TII.isBufferSMRD(*SMRD); for (const MachineOperand &Use : SMRD->uses()) { if (!Use.isReg()) @@ -341,23 +378,35 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { int WaitStatesNeededForUse = SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn); WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + + // This fixes what appears to be undocumented hardware behavior in SI where + // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor + // needs some number of nops in between. We don't know how many we need, but + // let's use 4. This wasn't discovered before probably because the only + // case when this happens is when we expand a 64-bit pointer into a full + // descriptor and use s_buffer_load_dword instead of s_load_dword, which was + // probably never encountered in the closed-source land. + if (IsBufferSMRD) { + int WaitStatesNeededForUse = + SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), + IsBufferHazardDefFn); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } } + return WaitStatesNeeded; } int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { - const SIInstrInfo *TII = ST.getInstrInfo(); - if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) return 0; - const SIRegisterInfo &TRI = TII->getRegisterInfo(); + int WaitStatesNeeded = checkSoftClauseHazards(VMEM); // A read of an SGPR by a VMEM instruction requires 5 wait states when the // SGPR was written by a VALU Instruction. - int VmemSgprWaitStates = 5; - int WaitStatesNeeded = 0; - auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; + const int VmemSgprWaitStates = 5; + auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); }; for (const MachineOperand &Use : VMEM->uses()) { if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg())) @@ -372,10 +421,13 @@ int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); - // Check for DPP VGPR read after VALU VGPR write. + // Check for DPP VGPR read after VALU VGPR write and EXEC write. 
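The SMRD and VMEM checks above, like the DPP check that follows, all reduce to the same arithmetic: a rule says a use needs some number of wait states after a hazardous def, so the recognizer reports max(0, requiredGap - statesSinceDef) per operand and keeps the largest value. A minimal standalone sketch of that bookkeeping, in plain C++ with hypothetical names rather than the LLVM API:

    #include <algorithm>
    #include <vector>

    // Toy stand-in for getWaitStatesSinceDef(): for each hazard-relevant
    // operand, how many instruction slots ago its hazardous def executed.
    int waitStatesNeeded(const std::vector<int> &SlotsSinceHazardDef,
                         int RequiredGap) {
      int Needed = 0;
      for (int Since : SlotsSinceHazardDef)
        Needed = std::max(Needed, RequiredGap - Since);
      return std::max(Needed, 0);   // already far enough apart -> no nops
    }

    // Example: the SMRD rule above uses a gap of 4 (SmrdSgprWaitStates);
    // if the offending VALU def ran one slot earlier, three nops remain:
    //   waitStatesNeeded({1}, 4) == 3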
int DppVgprWaitStates = 2; + int DppExecWaitStates = 5; int WaitStatesNeeded = 0; + auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; for (const MachineOperand &Use : DPP->uses()) { if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg())) @@ -385,6 +437,10 @@ int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); } + WaitStatesNeeded = std::max( + WaitStatesNeeded, + DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn)); + return WaitStatesNeeded; } @@ -475,39 +531,76 @@ int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) { return -1; } +int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def, + const MachineRegisterInfo &MRI) { + // Helper to check for the hazard where VMEM instructions that store more than + // 8 bytes can have there store data over written by the next instruction. + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + const int VALUWaitStates = 1; + int WaitStatesNeeded = 0; + + if (!TRI->isVGPR(MRI, Def.getReg())) + return WaitStatesNeeded; + unsigned Reg = Def.getReg(); + auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) { + int DataIdx = createsVALUHazard(*MI); + return DataIdx >= 0 && + TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg); + }; + int WaitStatesNeededForDef = + VALUWaitStates - getWaitStatesSince(IsHazardFn); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + + return WaitStatesNeeded; +} + int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) { // This checks for the hazard where VMEM instructions that store more than // 8 bytes can have there store data over written by the next instruction. if (!ST.has12DWordStoreHazard()) return 0; - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const MachineRegisterInfo &MRI = VALU->getParent()->getParent()->getRegInfo(); - - const int VALUWaitStates = 1; + const MachineRegisterInfo &MRI = MF.getRegInfo(); int WaitStatesNeeded = 0; for (const MachineOperand &Def : VALU->defs()) { - if (!TRI->isVGPR(MRI, Def.getReg())) - continue; - unsigned Reg = Def.getReg(); - auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) { - int DataIdx = createsVALUHazard(*MI); - return DataIdx >= 0 && - TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg); - }; - int WaitStatesNeededForDef = - VALUWaitStates - getWaitStatesSince(IsHazardFn); - WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef); + WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI)); + } + + return WaitStatesNeeded; +} + +int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) { + // This checks for hazards associated with inline asm statements. + // Since inline asms can contain just about anything, we use this + // to call/leverage other check*Hazard routines. Note that + // this function doesn't attempt to address all possible inline asm + // hazards (good luck), but is a collection of what has been + // problematic thus far. 
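Because an inline asm statement may define arbitrary registers, the per-def check is factored out of checkVALUHazards() into checkVALUHazardsHelper() and then rerun over the asm's register def operands, as the code after this comment does. A rough standalone sketch of that shape, with invented names and a toy hazard predicate standing in for the real helper:

    #include <algorithm>
    #include <vector>

    struct Operand { bool IsReg, IsDef; int Reg; };

    // Toy stand-in for checkVALUHazardsHelper(): pretend register 5 was the
    // store-data VGPR of a recent 12+ dword VMEM store and needs one slot.
    int perDefWaitStates(int Reg) { return Reg == 5 ? 1 : 0; }

    // Shared walk over def operands, reusable for both ordinary VALU
    // instructions and inline asm statements.
    int checkDefs(const std::vector<Operand> &Ops) {
      int Needed = 0;
      for (const Operand &Op : Ops)
        if (Op.IsReg && Op.IsDef)
          Needed = std::max(Needed, perDefWaitStates(Op.Reg));
      return Needed;
    }

The point of the split is that the same per-def logic now serves both callers without duplicating the VMEM store-data scan.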
+ + // see checkVALUHazards() + if (!ST.has12DWordStoreHazard()) + return 0; + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + int WaitStatesNeeded = 0; + + for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands(); + I != E; ++I) { + const MachineOperand &Op = IA->getOperand(I); + if (Op.isReg() && Op.isDef()) { + WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI)); + } } + return WaitStatesNeeded; } int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) { const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const MachineRegisterInfo &MRI = - RWLane->getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); const MachineOperand *LaneSelectOp = TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1); @@ -568,11 +661,8 @@ int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) { } int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) { - if (!ST.hasReadM0Hazard()) - return 0; - const SIInstrInfo *TII = ST.getInstrInfo(); - int SMovRelWaitStates = 1; + const int SMovRelWaitStates = 1; auto IsHazardFn = [TII] (MachineInstr *MI) { return TII->isSALU(*MI); }; diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h index 5680c3de6a1a..f9a6e395a454 100644 --- a/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H #define LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/ScheduleHazardRecognizer.h" #include <list> @@ -22,8 +23,11 @@ namespace llvm { class MachineFunction; class MachineInstr; +class MachineOperand; +class MachineRegisterInfo; class ScheduleDAG; class SIInstrInfo; +class SIRegisterInfo; class SISubtarget; class GCNHazardRecognizer final : public ScheduleHazardRecognizer { @@ -35,6 +39,20 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { const MachineFunction &MF; const SISubtarget &ST; const SIInstrInfo &TII; + const SIRegisterInfo &TRI; + + /// RegUnits of uses in the current soft memory clause. + BitVector ClauseUses; + + /// RegUnits of defs in the current soft memory clause. 
+ BitVector ClauseDefs; + + void resetClause() { + ClauseUses.reset(); + ClauseDefs.reset(); + } + + void addClauseInst(const MachineInstr &MI); int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard); int getWaitStatesSinceDef(unsigned Reg, @@ -42,7 +60,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { [](MachineInstr *) { return true; }); int getWaitStatesSinceSetReg(function_ref<bool(MachineInstr *)> IsHazard); - int checkSMEMSoftClauseHazards(MachineInstr *SMEM); + int checkSoftClauseHazards(MachineInstr *SMEM); int checkSMRDHazards(MachineInstr *SMRD); int checkVMEMHazards(MachineInstr* VMEM); int checkDPPHazards(MachineInstr *DPP); @@ -51,8 +69,10 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { int checkSetRegHazards(MachineInstr *SetRegInstr); int createsVALUHazard(const MachineInstr &MI); int checkVALUHazards(MachineInstr *VALU); + int checkVALUHazardsHelper(const MachineOperand &Def, const MachineRegisterInfo &MRI); int checkRWLaneHazards(MachineInstr *RWLane); int checkRFEHazards(MachineInstr *RFE); + int checkInlineAsmHazards(MachineInstr *IA); int checkAnyInstHazards(MachineInstr *MI); int checkReadM0Hazards(MachineInstr *SMovRel); public: diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp new file mode 100644 index 000000000000..ba8211b189cf --- /dev/null +++ b/lib/Target/AMDGPU/GCNILPSched.cpp @@ -0,0 +1,364 @@ +//===---------------------------- GCNILPSched.cpp - -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ScheduleDAG.h" + +using namespace llvm; + +#define DEBUG_TYPE "machine-scheduler" + +namespace { + +class GCNILPScheduler { + struct Candidate : ilist_node<Candidate> { + SUnit *SU; + + Candidate(SUnit *SU_) + : SU(SU_) {} + }; + + SpecificBumpPtrAllocator<Candidate> Alloc; + typedef simple_ilist<Candidate> Queue; + Queue PendingQueue; + Queue AvailQueue; + unsigned CurQueueId = 0; + + std::vector<unsigned> SUNumbers; + + /// CurCycle - The current scheduler state corresponds to this cycle. + unsigned CurCycle = 0; + + unsigned getNodePriority(const SUnit *SU) const; + + const SUnit *pickBest(const SUnit *left, const SUnit *right); + Candidate* pickCandidate(); + + void releasePending(); + void advanceToCycle(unsigned NextCycle); + void releasePredecessors(const SUnit* SU); + +public: + std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots, + const ScheduleDAG &DAG); +}; +} // namespace + +/// CalcNodeSethiUllmanNumber - Compute Sethi Ullman number. +/// Smaller number is the higher priority. 
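Before the recursion that follows, a minimal standalone model of this Sethi-Ullman variant may help: a node's number is the maximum over its data predecessors, plus one for every additional predecessor that ties that maximum, and never less than 1. The types below are invented for illustration and assume Num is pre-sized and zero-filled:

    #include <algorithm>
    #include <vector>

    struct Node { std::vector<int> Preds; };  // data predecessors only

    unsigned suNumber(const std::vector<Node> &DAG,
                      std::vector<unsigned> &Num, int N) {
      if (Num[N] != 0)                        // memoized, mirrors SUNumbers
        return Num[N];
      unsigned Best = 0, Extra = 0;
      for (int P : DAG[N].Preds) {
        unsigned PN = suNumber(DAG, Num, P);
        if (PN > Best) { Best = PN; Extra = 0; }
        else if (PN == Best) ++Extra;
      }
      return Num[N] = std::max(Best + Extra, 1u);
    }

    // Example: two independent leaves feeding one add. Each leaf gets 1;
    // the add sees a tie at 1 with one extra predecessor, so it gets 2.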
+static unsigned +CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) { + unsigned &SethiUllmanNumber = SUNumbers[SU->NodeNum]; + if (SethiUllmanNumber != 0) + return SethiUllmanNumber; + + unsigned Extra = 0; + for (const SDep &Pred : SU->Preds) { + if (Pred.isCtrl()) continue; // ignore chain preds + SUnit *PredSU = Pred.getSUnit(); + unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers); + if (PredSethiUllman > SethiUllmanNumber) { + SethiUllmanNumber = PredSethiUllman; + Extra = 0; + } + else if (PredSethiUllman == SethiUllmanNumber) + ++Extra; + } + + SethiUllmanNumber += Extra; + + if (SethiUllmanNumber == 0) + SethiUllmanNumber = 1; + + return SethiUllmanNumber; +} + +// Lower priority means schedule further down. For bottom-up scheduling, lower +// priority SUs are scheduled before higher priority SUs. +unsigned GCNILPScheduler::getNodePriority(const SUnit *SU) const { + assert(SU->NodeNum < SUNumbers.size()); + if (SU->NumSuccs == 0 && SU->NumPreds != 0) + // If SU does not have a register use, i.e. it doesn't produce a value + // that would be consumed (e.g. store), then it terminates a chain of + // computation. Give it a large SethiUllman number so it will be + // scheduled right before its predecessors that it doesn't lengthen + // their live ranges. + return 0xffff; + + if (SU->NumPreds == 0 && SU->NumSuccs != 0) + // If SU does not have a register def, schedule it close to its uses + // because it does not lengthen any live ranges. + return 0; + + return SUNumbers[SU->NodeNum]; +} + +/// closestSucc - Returns the scheduled cycle of the successor which is +/// closest to the current cycle. +static unsigned closestSucc(const SUnit *SU) { + unsigned MaxHeight = 0; + for (const SDep &Succ : SU->Succs) { + if (Succ.isCtrl()) continue; // ignore chain succs + unsigned Height = Succ.getSUnit()->getHeight(); + // If there are bunch of CopyToRegs stacked up, they should be considered + // to be at the same position. + if (Height > MaxHeight) + MaxHeight = Height; + } + return MaxHeight; +} + +/// calcMaxScratches - Returns an cost estimate of the worse case requirement +/// for scratch registers, i.e. number of data dependencies. +static unsigned calcMaxScratches(const SUnit *SU) { + unsigned Scratches = 0; + for (const SDep &Pred : SU->Preds) { + if (Pred.isCtrl()) continue; // ignore chain preds + Scratches++; + } + return Scratches; +} + +// Return -1 if left has higher priority, 1 if right has higher priority. +// Return 0 if latency-based priority is equivalent. +static int BUCompareLatency(const SUnit *left, const SUnit *right) { + // Scheduling an instruction that uses a VReg whose postincrement has not yet + // been scheduled will induce a copy. Model this as an extra cycle of latency. + int LHeight = (int)left->getHeight(); + int RHeight = (int)right->getHeight(); + + // If either node is scheduling for latency, sort them by height/depth + // and latency. + + // If neither instruction stalls (!LStall && !RStall) and HazardRecognizer + // is enabled, grouping instructions by cycle, then its height is already + // covered so only its depth matters. We also reach this point if both stall + // but have the same height. + if (LHeight != RHeight) + return LHeight > RHeight ? 
1 : -1; + + int LDepth = left->getDepth(); + int RDepth = right->getDepth(); + if (LDepth != RDepth) { + DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum + << ") depth " << LDepth << " vs SU (" << right->NodeNum + << ") depth " << RDepth << "\n"); + return LDepth < RDepth ? 1 : -1; + } + if (left->Latency != right->Latency) + return left->Latency > right->Latency ? 1 : -1; + + return 0; +} + +const SUnit *GCNILPScheduler::pickBest(const SUnit *left, const SUnit *right) +{ + // TODO: add register pressure lowering checks + + bool const DisableSchedCriticalPath = false; + int MaxReorderWindow = 6; + if (!DisableSchedCriticalPath) { + int spread = (int)left->getDepth() - (int)right->getDepth(); + if (std::abs(spread) > MaxReorderWindow) { + DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): " + << left->getDepth() << " != SU(" << right->NodeNum << "): " + << right->getDepth() << "\n"); + return left->getDepth() < right->getDepth() ? right : left; + } + } + + bool const DisableSchedHeight = false; + if (!DisableSchedHeight && left->getHeight() != right->getHeight()) { + int spread = (int)left->getHeight() - (int)right->getHeight(); + if (std::abs(spread) > MaxReorderWindow) + return left->getHeight() > right->getHeight() ? right : left; + } + + // Prioritize by Sethi-Ulmann number and push CopyToReg nodes down. + unsigned LPriority = getNodePriority(left); + unsigned RPriority = getNodePriority(right); + + if (LPriority != RPriority) + return LPriority > RPriority ? right : left; + + // Try schedule def + use closer when Sethi-Ullman numbers are the same. + // e.g. + // t1 = op t2, c1 + // t3 = op t4, c2 + // + // and the following instructions are both ready. + // t2 = op c3 + // t4 = op c4 + // + // Then schedule t2 = op first. + // i.e. + // t4 = op c4 + // t2 = op c3 + // t1 = op t2, c1 + // t3 = op t4, c2 + // + // This creates more short live intervals. + unsigned LDist = closestSucc(left); + unsigned RDist = closestSucc(right); + if (LDist != RDist) + return LDist < RDist ? right : left; + + // How many registers becomes live when the node is scheduled. + unsigned LScratch = calcMaxScratches(left); + unsigned RScratch = calcMaxScratches(right); + if (LScratch != RScratch) + return LScratch > RScratch ? right : left; + + bool const DisableSchedCycles = false; + if (!DisableSchedCycles) { + int result = BUCompareLatency(left, right); + if (result != 0) + return result > 0 ? right : left; + return left; + } + else { + if (left->getHeight() != right->getHeight()) + return (left->getHeight() > right->getHeight()) ? right : left; + + if (left->getDepth() != right->getDepth()) + return (left->getDepth() < right->getDepth()) ? right : left; + } + + assert(left->NodeQueueId && right->NodeQueueId && + "NodeQueueId cannot be zero"); + return (left->NodeQueueId > right->NodeQueueId) ? right : left; +} + +GCNILPScheduler::Candidate* GCNILPScheduler::pickCandidate() { + if (AvailQueue.empty()) + return nullptr; + auto Best = AvailQueue.begin(); + for (auto I = std::next(AvailQueue.begin()), E = AvailQueue.end(); I != E; ++I) { + auto NewBestSU = pickBest(Best->SU, I->SU); + if (NewBestSU != Best->SU) { + assert(NewBestSU == I->SU); + Best = I; + } + } + return &*Best; +} + +void GCNILPScheduler::releasePending() { + // Check to see if any of the pending instructions are ready to issue. If + // so, add them to the available queue. 
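A standalone model of the pending-to-available hand-off described here and implemented just below, using std containers in place of the allocator-backed ilist and invented names:

    #include <algorithm>
    #include <vector>

    struct Cand { unsigned ReadyHeight; /* SUnit* in the real code */ };

    struct Queues {
      std::vector<Cand> Pending, Avail;
      unsigned CurCycle = 0;

      // Anything whose height has been reached becomes schedulable.
      void releasePending() {
        auto Ready = [&](const Cand &C) { return C.ReadyHeight <= CurCycle; };
        for (const Cand &C : Pending)
          if (Ready(C))
            Avail.push_back(C);
        Pending.erase(std::remove_if(Pending.begin(), Pending.end(), Ready),
                      Pending.end());
      }

      void advanceToCycle(unsigned Next) {
        if (Next <= CurCycle)
          return;
        CurCycle = Next;
        releasePending();
      }
    };

In the scheduler itself, when AvailQueue runs dry the cycle is advanced to at least the smallest pending height, which is what the std::min_element call in schedule() computes.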
+ for(auto I = PendingQueue.begin(), E = PendingQueue.end(); I != E;) { + auto &C = *I++; + if (C.SU->getHeight() <= CurCycle) { + PendingQueue.remove(C); + AvailQueue.push_back(C); + C.SU->NodeQueueId = CurQueueId++; + } + } +} + +/// Move the scheduler state forward by the specified number of Cycles. +void GCNILPScheduler::advanceToCycle(unsigned NextCycle) { + if (NextCycle <= CurCycle) + return; + CurCycle = NextCycle; + releasePending(); +} + +void GCNILPScheduler::releasePredecessors(const SUnit* SU) { + for (const auto &PredEdge : SU->Preds) { + auto PredSU = PredEdge.getSUnit(); + if (PredEdge.isWeak()) + continue; + assert(PredSU->isBoundaryNode() || PredSU->NumSuccsLeft > 0); + + PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge.getLatency()); + + if (!PredSU->isBoundaryNode() && --PredSU->NumSuccsLeft == 0) + PendingQueue.push_front(*new (Alloc.Allocate()) Candidate(PredSU)); + } +} + +std::vector<const SUnit*> +GCNILPScheduler::schedule(ArrayRef<const SUnit*> BotRoots, + const ScheduleDAG &DAG) { + auto &SUnits = const_cast<ScheduleDAG&>(DAG).SUnits; + + std::vector<SUnit> SUSavedCopy; + SUSavedCopy.resize(SUnits.size()); + + // we cannot save only those fields we touch: some of them are private + // so save units verbatim: this assumes SUnit should have value semantics + for (const SUnit &SU : SUnits) + SUSavedCopy[SU.NodeNum] = SU; + + SUNumbers.assign(SUnits.size(), 0); + for (const SUnit &SU : SUnits) + CalcNodeSethiUllmanNumber(&SU, SUNumbers); + + for (auto SU : BotRoots) { + AvailQueue.push_back( + *new (Alloc.Allocate()) Candidate(const_cast<SUnit*>(SU))); + } + releasePredecessors(&DAG.ExitSU); + + std::vector<const SUnit*> Schedule; + Schedule.reserve(SUnits.size()); + while (true) { + if (AvailQueue.empty() && !PendingQueue.empty()) { + auto EarliestSU = std::min_element( + PendingQueue.begin(), PendingQueue.end(), + [=](const Candidate& C1, const Candidate& C2) { + return C1.SU->getHeight() < C2.SU->getHeight(); + })->SU; + advanceToCycle(std::max(CurCycle + 1, EarliestSU->getHeight())); + } + if (AvailQueue.empty()) + break; + + DEBUG( + dbgs() << "\n=== Picking candidate\n" + "Ready queue:"; + for (auto &C : AvailQueue) + dbgs() << ' ' << C.SU->NodeNum; + dbgs() << '\n'; + ); + + auto C = pickCandidate(); + assert(C); + AvailQueue.remove(*C); + auto SU = C->SU; + DEBUG(dbgs() << "Selected "; SU->dump(&DAG)); + + advanceToCycle(SU->getHeight()); + + releasePredecessors(SU); + Schedule.push_back(SU); + SU->isScheduled = true; + } + assert(SUnits.size() == Schedule.size()); + + std::reverse(Schedule.begin(), Schedule.end()); + + // restore units + for (auto &SU : SUnits) + SU = SUSavedCopy[SU.NodeNum]; + + return Schedule; +} + +namespace llvm { +std::vector<const SUnit*> makeGCNILPScheduler(ArrayRef<const SUnit*> BotRoots, + const ScheduleDAG &DAG) { + GCNILPScheduler S; + return S.schedule(BotRoots, DAG); +} +} diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 2e7641cda375..a0e4f7ff24cb 100644 --- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -1,4 +1,4 @@ -//===--------------------- GCNIterativeScheduler.cpp - --------------------===// +//===- GCNIterativeScheduler.cpp ------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -6,21 +6,40 @@ // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// -// -/// \file -// -//===----------------------------------------------------------------------===// #include "GCNIterativeScheduler.h" +#include "AMDGPUSubtarget.h" +#include "GCNRegPressure.h" #include "GCNSchedStrategy.h" -#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <limits> +#include <memory> +#include <type_traits> +#include <vector> using namespace llvm; #define DEBUG_TYPE "machine-scheduler" namespace llvm { - std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots, + +std::vector<const SUnit *> makeMinRegSchedule(ArrayRef<const SUnit *> TopRoots, + const ScheduleDAG &DAG); + + std::vector<const SUnit*> makeGCNILPScheduler(ArrayRef<const SUnit*> BotRoots, const ScheduleDAG &DAG); } @@ -44,8 +63,8 @@ static void printRegion(raw_ostream &OS, unsigned MaxInstNum = std::numeric_limits<unsigned>::max()) { auto BB = Begin->getParent(); - OS << BB->getParent()->getName() << ":BB#" << BB->getNumber() - << ' ' << BB->getName() << ":\n"; + OS << BB->getParent()->getName() << ":" << printMBBReference(*BB) << ' ' + << BB->getName() << ":\n"; auto I = Begin; MaxInstNum = std::max(MaxInstNum, 1u); for (; I != End && MaxInstNum; ++I, --MaxInstNum) { @@ -117,13 +136,14 @@ void GCNIterativeScheduler::printSchedRP(raw_ostream &OS, OS << "RP after: "; After.print(OS, &ST); } - #endif // DAG builder helper class GCNIterativeScheduler::BuildDAG { GCNIterativeScheduler &Sch; - SmallVector<SUnit*, 8> TopRoots; + SmallVector<SUnit *, 8> TopRoots; + + SmallVector<SUnit*, 8> BotRoots; public: BuildDAG(const Region &R, GCNIterativeScheduler &_Sch) : Sch(_Sch) { @@ -134,17 +154,20 @@ public: Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr, /*TrackLaneMask*/true); Sch.Topo.InitDAGTopologicalSorting(); - - SmallVector<SUnit*, 8> BotRoots; Sch.findRootsAndBiasEdges(TopRoots, BotRoots); } + ~BuildDAG() { Sch.BaseClass::exitRegion(); Sch.BaseClass::finishBlock(); } - ArrayRef<const SUnit*> getTopRoots() const { + + ArrayRef<const SUnit *> getTopRoots() const { return TopRoots; } + ArrayRef<SUnit*> getBottomRoots() const { + return BotRoots; + } }; class GCNIterativeScheduler::OverrideLegacyStrategy { @@ -152,6 +175,7 @@ class GCNIterativeScheduler::OverrideLegacyStrategy { Region &Rgn; std::unique_ptr<MachineSchedStrategy> SaveSchedImpl; GCNRegPressure SaveMaxRP; + public: OverrideLegacyStrategy(Region &R, MachineSchedStrategy &OverrideStrategy, @@ -165,12 +189,14 @@ public: Sch.BaseClass::startBlock(BB); Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs); } + ~OverrideLegacyStrategy() { Sch.BaseClass::exitRegion(); Sch.BaseClass::finishBlock(); Sch.SchedImpl.release(); Sch.SchedImpl = std::move(SaveSchedImpl); } + void schedule() { assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End); DEBUG(dbgs() << "\nScheduling "; @@ -183,6 +209,7 @@ public: Rgn.Begin = Sch.RegionBegin; Rgn.MaxPressure.clear(); } + void restoreOrder() { assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End); // DAG SUnits 
are stored using original region's order @@ -192,6 +219,7 @@ public: }; namespace { + // just a stub to make base class happy class SchedStrategyStub : public MachineSchedStrategy { public: @@ -203,7 +231,8 @@ public: void releaseTopNode(SUnit *SU) override {} void releaseBottomNode(SUnit *SU) override {} }; -} // namespace + +} // end anonymous namespace GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C, StrategyKind S) @@ -298,6 +327,7 @@ void GCNIterativeScheduler::finalizeSchedule() { // overriden case SCHEDULE_MINREGONLY: scheduleMinReg(); break; case SCHEDULE_MINREGFORCED: scheduleMinReg(true); break; case SCHEDULE_LEGACYMAXOCCUPANCY: scheduleLegacyMaxOccupancy(); break; + case SCHEDULE_ILP: scheduleILP(false); break; } } @@ -528,3 +558,43 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) { MaxPressure = RP; } } + +/////////////////////////////////////////////////////////////////////////////// +// ILP scheduler port + +void GCNIterativeScheduler::scheduleILP( + bool TryMaximizeOccupancy) { + const auto &ST = MF.getSubtarget<SISubtarget>(); + auto TgtOcc = std::min(ST.getOccupancyWithLocalMemSize(MF), + ST.getWavesPerEU(MF.getFunction()).second); + + sortRegionsByPressure(TgtOcc); + auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); + + if (TryMaximizeOccupancy && Occ < TgtOcc) + Occ = tryMaximizeOccupancy(TgtOcc); + + TgtOcc = std::min(Occ, TgtOcc); + DEBUG(dbgs() << "Scheduling using default scheduler, " + "target occupancy = " << TgtOcc << '\n'); + + for (auto R : Regions) { + BuildDAG DAG(*R, *this); + const auto ILPSchedule = makeGCNILPScheduler(DAG.getBottomRoots(), *this); + + const auto RP = getSchedulePressure(*R, ILPSchedule); + DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP)); + + if (RP.getOccupancy(ST) < TgtOcc) { + DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc); + if (R->BestSchedule.get() && + R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) { + DEBUG(dbgs() << ", scheduling minimal register\n"); + scheduleBest(*R); + } + } else { + scheduleRegion(*R, ILPSchedule, RP); + DEBUG(printSchedResult(dbgs(), R, RP)); + } + } +} diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.h b/lib/Target/AMDGPU/GCNIterativeScheduler.h index df3afce21ebc..14ef5147f32a 100644 --- a/lib/Target/AMDGPU/GCNIterativeScheduler.h +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.h @@ -1,4 +1,4 @@ -//===--------- GCNIterativeScheduler.h - GCN Scheduler -*- C++ -*----------===// +//===- GCNIterativeScheduler.h - GCN Scheduler ------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -6,27 +6,34 @@ // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// -// -/// \file -// -//===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H #define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H #include "GCNRegPressure.h" - +#include "llvm/ADT/ArrayRef.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/Support/Allocator.h" +#include <limits> +#include <memory> +#include <vector> namespace llvm { +class MachineInstr; +class SUnit; +class raw_ostream; + class GCNIterativeScheduler : public ScheduleDAGMILive { - typedef ScheduleDAGMILive BaseClass; + using BaseClass = ScheduleDAGMILive; + public: enum StrategyKind { SCHEDULE_MINREGONLY, SCHEDULE_MINREGFORCED, - SCHEDULE_LEGACYMAXOCCUPANCY + SCHEDULE_LEGACYMAXOCCUPANCY, + SCHEDULE_ILP }; GCNIterativeScheduler(MachineSchedContext *C, @@ -42,11 +49,10 @@ public: void finalizeSchedule() override; protected: - - typedef ArrayRef<const SUnit*> ScheduleRef; + using ScheduleRef = ArrayRef<const SUnit *>; struct TentativeSchedule { - std::vector<MachineInstr*> Schedule; + std::vector<MachineInstr *> Schedule; GCNRegPressure MaxPressure; }; @@ -103,6 +109,7 @@ protected: void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true); void scheduleMinReg(bool force = false); + void scheduleILP(bool TryMaximizeOccupancy = true); void printRegions(raw_ostream &OS) const; void printSchedResult(raw_ostream &OS, @@ -113,6 +120,6 @@ protected: const GCNRegPressure &After) const; }; -} // End namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp index 0657f67b217d..9904b5f0f4ba 100644 --- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp +++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp @@ -1,4 +1,4 @@ -//===----------------------- GCNMinRegStrategy.cpp - ----------------------===// +//===- GCNMinRegStrategy.cpp ----------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -6,18 +6,27 @@ // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// -// -/// \file -// -//===----------------------------------------------------------------------===// +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/ilist_node.h" +#include "llvm/ADT/simple_ilist.h" #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cstdint> +#include <limits> +#include <vector> using namespace llvm; #define DEBUG_TYPE "machine-scheduler" namespace { + class GCNMinRegScheduler { struct Candidate : ilist_node<Candidate> { const SUnit *SU; @@ -28,7 +37,7 @@ class GCNMinRegScheduler { }; SpecificBumpPtrAllocator<Candidate> Alloc; - typedef simple_ilist<Candidate> Queue; + using Queue = simple_ilist<Candidate>; Queue RQ; // Ready queue std::vector<unsigned> NumPreds; @@ -72,7 +81,8 @@ public: std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots, const ScheduleDAG &DAG); }; -} // namespace + +} // end anonymous namespace void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) { NumPreds.resize(SUnits.size()); @@ -104,7 +114,9 @@ int GCNMinRegScheduler::getNotReadySuccessors(const SUnit *SU) const { template <typename Calc> unsigned GCNMinRegScheduler::findMax(unsigned Num, Calc C) { assert(!RQ.empty() && Num <= RQ.size()); - typedef decltype(C(*RQ.begin())) T; + + using T = decltype(C(*RQ.begin())) ; + T Max = std::numeric_limits<T>::min(); unsigned NumMax = 0; for (auto I = RQ.begin(); Num; --Num) { @@ -260,9 +272,11 @@ GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots, } namespace llvm { + std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots, const ScheduleDAG &DAG) { GCNMinRegScheduler S; return S.schedule(TopRoots, DAG); } -} + +} // end namespace llvm diff --git a/lib/Target/AMDGPU/GCNProcessors.td b/lib/Target/AMDGPU/GCNProcessors.td new file mode 100644 index 000000000000..b2a3f652abd8 --- /dev/null +++ b/lib/Target/AMDGPU/GCNProcessors.td @@ -0,0 +1,154 @@ +//===-- GCNProcessors.td - GCN Processor definitions ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// The code produced for "generic" is only useful for tests and cannot +// reasonably be expected to execute on any particular target. +def : ProcessorModel<"generic", NoSchedModel, + [FeatureGCN, FeatureWavefrontSize64] +>; + +//===----------------------------------------------------------------------===// +// GCN GFX6 (Southern Islands (SI)). 
+//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx600", SIFullSpeedModel, + [FeatureISAVersion6_0_0] +>; + +def : ProcessorModel<"tahiti", SIFullSpeedModel, + [FeatureISAVersion6_0_0] +>; + +def : ProcessorModel<"gfx601", SIQuarterSpeedModel, + [FeatureISAVersion6_0_1] +>; + +def : ProcessorModel<"hainan", SIQuarterSpeedModel, + [FeatureISAVersion6_0_1] +>; + +def : ProcessorModel<"oland", SIQuarterSpeedModel, + [FeatureISAVersion6_0_1] +>; + +def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, + [FeatureISAVersion6_0_1] +>; + +def : ProcessorModel<"verde", SIQuarterSpeedModel, + [FeatureISAVersion6_0_1] +>; + +//===----------------------------------------------------------------------===// +// GCN GFX7 (Sea Islands (CI)). +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx700", SIQuarterSpeedModel, + [FeatureISAVersion7_0_0] +>; + +def : ProcessorModel<"kaveri", SIQuarterSpeedModel, + [FeatureISAVersion7_0_0] +>; + +def : ProcessorModel<"gfx701", SIFullSpeedModel, + [FeatureISAVersion7_0_1] +>; + +def : ProcessorModel<"hawaii", SIFullSpeedModel, + [FeatureISAVersion7_0_1] +>; + +def : ProcessorModel<"gfx702", SIQuarterSpeedModel, + [FeatureISAVersion7_0_2] +>; + +def : ProcessorModel<"gfx703", SIQuarterSpeedModel, + [FeatureISAVersion7_0_3] +>; + +def : ProcessorModel<"kabini", SIQuarterSpeedModel, + [FeatureISAVersion7_0_3] +>; + +def : ProcessorModel<"mullins", SIQuarterSpeedModel, + [FeatureISAVersion7_0_3] +>; + +def : ProcessorModel<"gfx704", SIQuarterSpeedModel, + [FeatureISAVersion7_0_4] +>; + +def : ProcessorModel<"bonaire", SIQuarterSpeedModel, + [FeatureISAVersion7_0_4] +>; + +//===----------------------------------------------------------------------===// +// GCN GFX8 (Volcanic Islands (VI)). +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx800", SIQuarterSpeedModel, + [FeatureISAVersion8_0_0] +>; + +def : ProcessorModel<"iceland", SIQuarterSpeedModel, + [FeatureISAVersion8_0_0] +>; + +def : ProcessorModel<"gfx801", SIQuarterSpeedModel, + [FeatureISAVersion8_0_1] +>; + +def : ProcessorModel<"carrizo", SIQuarterSpeedModel, + [FeatureISAVersion8_0_1] +>; + +def : ProcessorModel<"gfx802", SIQuarterSpeedModel, + [FeatureISAVersion8_0_2] +>; + +def : ProcessorModel<"tonga", SIQuarterSpeedModel, + [FeatureISAVersion8_0_2] +>; + +def : ProcessorModel<"gfx803", SIQuarterSpeedModel, + [FeatureISAVersion8_0_3] +>; + +def : ProcessorModel<"fiji", SIQuarterSpeedModel, + [FeatureISAVersion8_0_3] +>; + +def : ProcessorModel<"polaris10", SIQuarterSpeedModel, + [FeatureISAVersion8_0_3] +>; + +def : ProcessorModel<"polaris11", SIQuarterSpeedModel, + [FeatureISAVersion8_0_3] +>; + +def : ProcessorModel<"gfx810", SIQuarterSpeedModel, + [FeatureISAVersion8_1_0] +>; + +def : ProcessorModel<"stoney", SIQuarterSpeedModel, + [FeatureISAVersion8_1_0] +>; + +//===----------------------------------------------------------------------===// +// GCN GFX9. 
+//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx900", SIQuarterSpeedModel, + [FeatureISAVersion9_0_0] +>; + +def : ProcessorModel<"gfx902", SIQuarterSpeedModel, + [FeatureISAVersion9_0_2] +>; diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index 1d02c7fdffbf..992bb7cceb6f 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -1,4 +1,4 @@ -//===------------------------- GCNRegPressure.cpp - -----------------------===// +//===- GCNRegPressure.cpp -------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -6,13 +6,26 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -/// \file -// -//===----------------------------------------------------------------------===// #include "GCNRegPressure.h" +#include "AMDGPUSubtarget.h" +#include "SIRegisterInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> using namespace llvm; @@ -36,7 +49,7 @@ void llvm::printLivesAt(SlotIndex SI, for (const auto &S : LI.subranges()) { if (!S.liveAt(SI)) continue; if (firstTime) { - dbgs() << " " << PrintReg(Reg, MRI.getTargetRegisterInfo()) + dbgs() << " " << printReg(Reg, MRI.getTargetRegisterInfo()) << '\n'; firstTime = false; } @@ -63,7 +76,6 @@ static bool isEqual(const GCNRPTracker::LiveRegSet &S1, } return true; } - #endif /////////////////////////////////////////////////////////////////////////////// @@ -107,7 +119,7 @@ void GCNRegPressure::inc(unsigned Reg, assert(PrevMask < NewMask); Value[Kind == SGPR_TUPLE ? 
SGPR32 : VGPR32] += - Sign * countPopulation((~PrevMask & NewMask).getAsInteger()); + Sign * (~PrevMask & NewMask).getNumLanes(); if (PrevMask.none()) { assert(NewMask.any()); @@ -177,7 +189,6 @@ void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const { } #endif - static LaneBitmask getDefRegMask(const MachineOperand &MO, const MachineRegisterInfo &MRI) { assert(MO.isDef() && MO.isReg() && @@ -201,7 +212,7 @@ static LaneBitmask getUsedRegMask(const MachineOperand &MO, return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg); auto MaxMask = MRI.getMaxLaneMaskForVReg(MO.getReg()); - if (MaxMask.getAsInteger() == 1) // cannot have subregs + if (MaxMask == LaneBitmask::getLane(0)) // cannot have subregs return MaxMask; // For a tentative schedule LIS isn't updated yet but livemask should remain @@ -430,12 +441,12 @@ static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, for (auto const &P : TrackedLR) { auto I = LISLR.find(P.first); if (I == LISLR.end()) { - dbgs() << " " << PrintReg(P.first, TRI) + dbgs() << " " << printReg(P.first, TRI) << ":L" << PrintLaneMask(P.second) << " isn't found in LIS reported set\n"; } else if (I->second != P.second) { - dbgs() << " " << PrintReg(P.first, TRI) + dbgs() << " " << printReg(P.first, TRI) << " masks doesn't match: LIS reported " << PrintLaneMask(I->second) << ", tracked " @@ -446,7 +457,7 @@ static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, for (auto const &P : LISLR) { auto I = TrackedLR.find(P.first); if (I == TrackedLR.end()) { - dbgs() << " " << PrintReg(P.first, TRI) + dbgs() << " " << printReg(P.first, TRI) << ":L" << PrintLaneMask(P.second) << " isn't found in tracked set\n"; } @@ -484,7 +495,7 @@ void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs, unsigned Reg = TargetRegisterInfo::index2VirtReg(I); auto It = LiveRegs.find(Reg); if (It != LiveRegs.end() && It->second.any()) - OS << ' ' << PrintVRegOrUnit(Reg, TRI) << ':' + OS << ' ' << printVRegOrUnit(Reg, TRI) << ':' << PrintLaneMask(It->second); } OS << '\n'; diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h index 5dfe44053e72..e418aa0fe911 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.h +++ b/lib/Target/AMDGPU/GCNRegPressure.h @@ -1,4 +1,4 @@ -//===---------------------- GCNRegPressure.h -*- C++ -*--------------------===// +//===- GCNRegPressure.h -----------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -6,20 +6,26 @@ // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// -// -/// \file -// -//===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H #define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H #include "AMDGPUSubtarget.h" - +#include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/Support/Debug.h" +#include <algorithm> #include <limits> namespace llvm { +class MachineRegisterInfo; +class raw_ostream; + struct GCNRegPressure { enum RegKind { SGPR32, @@ -68,7 +74,7 @@ struct GCNRegPressure { return !(*this == O); } - void print(raw_ostream &OS, const SISubtarget *ST=nullptr) const; + void print(raw_ostream &OS, const SISubtarget *ST = nullptr) const; void dump() const { print(dbgs()); } private: @@ -89,7 +95,7 @@ inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) { class GCNRPTracker { public: - typedef DenseMap<unsigned, LaneBitmask> LiveRegSet; + using LiveRegSet = DenseMap<unsigned, LaneBitmask>; protected: const LiveIntervals &LIS; @@ -97,7 +103,9 @@ protected: GCNRegPressure CurPressure, MaxPressure; const MachineInstr *LastTrackedMI = nullptr; mutable const MachineRegisterInfo *MRI = nullptr; + GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {} + public: // live regs for the current state const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; } @@ -111,9 +119,11 @@ public: MaxPressure.clear(); return Res; } + decltype(LiveRegs) moveLiveRegs() { return std::move(LiveRegs); } + static void printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs, const MachineRegisterInfo &MRI); }; @@ -121,6 +131,7 @@ public: class GCNUpwardRPTracker : public GCNRPTracker { public: GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} + // reset tracker to the point just below MI // filling live regs upon this point using LIS void reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr); @@ -202,6 +213,6 @@ void printLivesAt(SlotIndex SI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI); -} // End namespace llvm +} // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 155b400ba022..d414b899050a 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -37,7 +37,7 @@ static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs, ST.getOccupancyWithNumVGPRs(VGPRs)); return std::min(MinRegOccupancy, ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), - *MF.getFunction())); + MF.getFunction())); } void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) { @@ -315,7 +315,7 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C, ST(MF.getSubtarget<SISubtarget>()), MFI(*MF.getInfo<SIMachineFunctionInfo>()), StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(), - *MF.getFunction())), + MF.getFunction())), MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) { DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); @@ -330,8 +330,9 @@ void GCNScheduleDAGMILive::schedule() { std::vector<MachineInstr*> Unsched; Unsched.reserve(NumRegionInstrs); - for (auto &I : *this) + for (auto &I : *this) { Unsched.push_back(&I); + } GCNRegPressure PressureBefore; if (LIS) 
{ @@ -387,10 +388,14 @@ void GCNScheduleDAGMILive::schedule() { DEBUG(dbgs() << "Attempting to revert scheduling.\n"); RegionEnd = RegionBegin; for (MachineInstr *MI : Unsched) { + if (MI->isDebugValue()) + continue; + if (MI->getIterator() != RegionEnd) { BB->remove(MI); BB->insert(RegionEnd, MI); - LIS->handleMove(*MI, true); + if (!MI->isDebugValue()) + LIS->handleMove(*MI, true); } // Reset read-undef flags and update them later. for (auto &Op : MI->operands()) @@ -398,13 +403,15 @@ void GCNScheduleDAGMILive::schedule() { Op.setIsUndef(false); RegisterOperands RegOpers; RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false); - if (ShouldTrackLaneMasks) { - // Adjust liveness and add missing dead+read-undef flags. - SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); - RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI); - } else { - // Adjust for missing dead-def flags. - RegOpers.detectDeadDefs(*MI, *LIS); + if (!MI->isDebugValue()) { + if (ShouldTrackLaneMasks) { + // Adjust liveness and add missing dead+read-undef flags. + SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot(); + RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI); + } else { + // Adjust for missing dead-def flags. + RegOpers.detectDeadDefs(*MI, *LIS); + } } RegionEnd = MI->getIterator(); ++RegionEnd; @@ -531,9 +538,8 @@ void GCNScheduleDAGMILive::finalizeSchedule() { } DEBUG(dbgs() << "********** MI Scheduling **********\n"); - DEBUG(dbgs() << MF.getName() - << ":BB#" << MBB->getNumber() << " " << MBB->getName() - << "\n From: " << *begin() << " To: "; + DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*MBB) << " " + << MBB->getName() << "\n From: " << *begin() << " To: "; if (RegionEnd != MBB->end()) dbgs() << *RegionEnd; else dbgs() << "End"; dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index a844081db5b2..67663d39967c 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -72,9 +72,9 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); } -void AMDGPUInstPrinter::printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, +void AMDGPUInstPrinter::printS13ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - O << formatDec(static_cast<int16_t>(MI->getOperand(OpNo).getImm())); + O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm())); } void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, @@ -129,7 +129,7 @@ void AMDGPUInstPrinter::printOffsetS13(const MCInst *MI, unsigned OpNo, uint16_t Imm = MI->getOperand(OpNo).getImm(); if (Imm != 0) { O << ((OpNo == 0)? "offset:" : " offset:"); - printS16ImmDecOperand(MI, OpNo, O); + printS13ImmDecOperand(MI, OpNo, O); } } @@ -344,16 +344,6 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(RegNo)) { O << 's'; NumRegs = 16; - } else if (MRI.getRegClass(AMDGPU::TTMP_64RegClassID).contains(RegNo)) { - O << "ttmp"; - NumRegs = 2; - // Trap temps start at offset 112. TODO: Get this from tablegen. - RegIdx -= 112; - } else if (MRI.getRegClass(AMDGPU::TTMP_128RegClassID).contains(RegNo)) { - O << "ttmp"; - NumRegs = 4; - // Trap temps start at offset 112. TODO: Get this from tablegen. 
- RegIdx -= 112; } else { O << getRegisterName(RegNo); return; @@ -496,6 +486,11 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { + if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) { + static_cast<R600InstPrinter*>(this)->printOperand(MI, OpNo, O); + return; + } + if (OpNo >= MI->getNumOperands()) { O << "/*Missing OP" << OpNo << "*/"; return; @@ -503,15 +498,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { - switch (Op.getReg()) { - // This is the default predicate state, so we don't need to print it. - case AMDGPU::PRED_SEL_OFF: - break; - - default: - printRegOperand(Op.getReg(), O, MRI); - break; - } + printRegOperand(Op.getReg(), O, MRI); } else if (Op.isImm()) { const MCInstrDesc &Desc = MII.get(MI->getOpcode()); switch (Desc.OpInfo[OpNo].OperandType) { @@ -808,19 +795,25 @@ void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo, } } -static bool allOpsDefaultValue(const int* Ops, int NumOps, int Mod) { - int DefaultValue = (Mod == SISrcMods::OP_SEL_1); +static bool allOpsDefaultValue(const int* Ops, int NumOps, int Mod, + bool IsPacked, bool HasDstSel) { + int DefaultValue = IsPacked && (Mod == SISrcMods::OP_SEL_1); for (int I = 0; I < NumOps; ++I) { if (!!(Ops[I] & Mod) != DefaultValue) return false; } + if (HasDstSel && (Ops[0] & SISrcMods::DST_OP_SEL) != 0) + return false; + return true; } -static void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod, - raw_ostream &O) { +void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, + StringRef Name, + unsigned Mod, + raw_ostream &O) { unsigned Opc = MI->getOpcode(); int NumOps = 0; int Ops[3]; @@ -835,7 +828,15 @@ static void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod, Ops[NumOps++] = MI->getOperand(Idx).getImm(); } - if (allOpsDefaultValue(Ops, NumOps, Mod)) + const bool HasDstSel = + NumOps > 0 && + Mod == SISrcMods::OP_SEL_0 && + MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3_OPSEL; + + const bool IsPacked = + MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsPacked; + + if (allOpsDefaultValue(Ops, NumOps, Mod, IsPacked, HasDstSel)) return; O << Name; @@ -846,6 +847,10 @@ static void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod, O << !!(Ops[I] & Mod); } + if (HasDstSel) { + O << ',' << !!(Ops[0] & SISrcMods::DST_OP_SEL); + } + O << ']'; } @@ -931,6 +936,11 @@ void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { + if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) { + static_cast<R600InstPrinter*>(this)->printMemOperand(MI, OpNo, O); + return; + } + printOperand(MI, OpNo, STI, O); O << ", "; printOperand(MI, OpNo + 1, STI, O); @@ -958,12 +968,19 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - printIfSet(MI, OpNo, O, '|'); + static_cast<R600InstPrinter*>(this)->printAbs(MI, OpNo, O); } void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - printIfSet(MI, OpNo, O, "_SAT"); + static_cast<R600InstPrinter*>(this)->printClamp(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printHigh(const MCInst *MI, unsigned OpNo, + 
const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) + O << " high"; } void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo, @@ -988,172 +1005,65 @@ void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isImm() || Op.isExpr()); - if (Op.isImm()) { - int64_t Imm = Op.getImm(); - O << Imm << '(' << BitsToFloat(Imm) << ')'; - } - if (Op.isExpr()) { - Op.getExpr()->print(O << '@', &MAI); - } + static_cast<R600InstPrinter*>(this)->printLiteral(MI, OpNo, O); } void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - printIfSet(MI, OpNo, O, "*", " "); + static_cast<R600InstPrinter*>(this)->printLast(MI, OpNo, O); } void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - printIfSet(MI, OpNo, O, '-'); + static_cast<R600InstPrinter*>(this)->printNeg(MI, OpNo, O); } void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - switch (MI->getOperand(OpNo).getImm()) { - default: break; - case 1: - O << " * 2.0"; - break; - case 2: - O << " * 4.0"; - break; - case 3: - O << " / 2.0"; - break; - } + static_cast<R600InstPrinter*>(this)->printOMOD(MI, OpNo, O); } void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - printIfSet(MI, OpNo, O, '+'); + static_cast<R600InstPrinter*>(this)->printRel(MI, OpNo, O); } void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - printIfSet(MI, OpNo, O, "ExecMask,"); + static_cast<R600InstPrinter*>(this)->printUpdateExecMask(MI, OpNo, O); } void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - printIfSet(MI, OpNo, O, "Pred,"); + static_cast<R600InstPrinter*>(this)->printUpdatePred(MI, OpNo, O); } void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.getImm() == 0) { - O << " (MASKED)"; - } -} - -void AMDGPUInstPrinter::printSel(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const char * chans = "XYZW"; - int sel = MI->getOperand(OpNo).getImm(); - - int chan = sel & 3; - sel >>= 2; - - if (sel >= 512) { - sel -= 512; - int cb = sel >> 12; - sel &= 4095; - O << cb << '[' << sel << ']'; - } else if (sel >= 448) { - sel -= 448; - O << sel; - } else if (sel >= 0){ - O << sel; - } - - if (sel >= 0) - O << '.' 
<< chans[chan]; + static_cast<R600InstPrinter*>(this)->printWrite(MI, OpNo, O); } void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - int BankSwizzle = MI->getOperand(OpNo).getImm(); - switch (BankSwizzle) { - case 1: - O << "BS:VEC_021/SCL_122"; - break; - case 2: - O << "BS:VEC_120/SCL_212"; - break; - case 3: - O << "BS:VEC_102/SCL_221"; - break; - case 4: - O << "BS:VEC_201"; - break; - case 5: - O << "BS:VEC_210"; - break; - default: - break; - } + static_cast<R600InstPrinter*>(this)->printBankSwizzle(MI, OpNo, O); } void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - unsigned Sel = MI->getOperand(OpNo).getImm(); - switch (Sel) { - case 0: - O << 'X'; - break; - case 1: - O << 'Y'; - break; - case 2: - O << 'Z'; - break; - case 3: - O << 'W'; - break; - case 4: - O << '0'; - break; - case 5: - O << '1'; - break; - case 7: - O << '_'; - break; - default: - break; - } + static_cast<R600InstPrinter*>(this)->printRSel(MI, OpNo, O); } void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - unsigned CT = MI->getOperand(OpNo).getImm(); - switch (CT) { - case 0: - O << 'U'; - break; - case 1: - O << 'N'; - break; - default: - break; - } + static_cast<R600InstPrinter*>(this)->printCT(MI, OpNo, O); } void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - int KCacheMode = MI->getOperand(OpNo).getImm(); - if (KCacheMode > 0) { - int KCacheBank = MI->getOperand(OpNo - 2).getImm(); - O << "CB" << KCacheBank << ':'; - int KCacheAddr = MI->getOperand(OpNo + 2).getImm(); - int LineSize = (KCacheMode == 1) ? 16 : 32; - O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize; - } + static_cast<R600InstPrinter*>(this)->printKCache(MI, OpNo, O); } void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, @@ -1356,3 +1266,198 @@ void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo, } #include "AMDGPUGenAsmWriter.inc" + +void R600InstPrinter::printAbs(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '|'); +} + +void R600InstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int BankSwizzle = MI->getOperand(OpNo).getImm(); + switch (BankSwizzle) { + case 1: + O << "BS:VEC_021/SCL_122"; + break; + case 2: + O << "BS:VEC_120/SCL_212"; + break; + case 3: + O << "BS:VEC_102/SCL_221"; + break; + case 4: + O << "BS:VEC_201"; + break; + case 5: + O << "BS:VEC_210"; + break; + default: + break; + } +} + +void R600InstPrinter::printClamp(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "_SAT"); +} + +void R600InstPrinter::printCT(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned CT = MI->getOperand(OpNo).getImm(); + switch (CT) { + case 0: + O << 'U'; + break; + case 1: + O << 'N'; + break; + default: + break; + } +} + +void R600InstPrinter::printKCache(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + int KCacheMode = MI->getOperand(OpNo).getImm(); + if (KCacheMode > 0) { + int KCacheBank = MI->getOperand(OpNo - 2).getImm(); + O << "CB" << KCacheBank << ':'; + int KCacheAddr = MI->getOperand(OpNo + 2).getImm(); + int LineSize = (KCacheMode == 1) ? 
16 : 32; + O << KCacheAddr * 16 << '-' << KCacheAddr * 16 + LineSize; + } +} + +void R600InstPrinter::printLast(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "*", " "); +} + +void R600InstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm() || Op.isExpr()); + if (Op.isImm()) { + int64_t Imm = Op.getImm(); + O << Imm << '(' << BitsToFloat(Imm) << ')'; + } + if (Op.isExpr()) { + Op.getExpr()->print(O << '@', &MAI); + } +} + +void R600InstPrinter::printNeg(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '-'); +} + +void R600InstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + switch (MI->getOperand(OpNo).getImm()) { + default: break; + case 1: + O << " * 2.0"; + break; + case 2: + O << " * 4.0"; + break; + case 3: + O << " / 2.0"; + break; + } +} + +void R600InstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printOperand(MI, OpNo, O); + O << ", "; + printOperand(MI, OpNo + 1, O); +} + +void R600InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (OpNo >= MI->getNumOperands()) { + O << "/*Missing OP" << OpNo << "*/"; + return; + } + + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + switch (Op.getReg()) { + // This is the default predicate state, so we don't need to print it. + case AMDGPU::PRED_SEL_OFF: + break; + + default: + O << getRegisterName(Op.getReg()); + break; + } + } else if (Op.isImm()) { + O << Op.getImm(); + } else if (Op.isFPImm()) { + // We special case 0.0 because otherwise it will be printed as an integer. + if (Op.getFPImm() == 0.0) + O << "0.0"; + else { + O << Op.getFPImm(); + } + } else if (Op.isExpr()) { + const MCExpr *Exp = Op.getExpr(); + Exp->print(O, &MAI); + } else { + O << "/*INV_OP*/"; + } +} + +void R600InstPrinter::printRel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '+'); +} + +void R600InstPrinter::printRSel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Sel = MI->getOperand(OpNo).getImm(); + switch (Sel) { + case 0: + O << 'X'; + break; + case 1: + O << 'Y'; + break; + case 2: + O << 'Z'; + break; + case 3: + O << 'W'; + break; + case 4: + O << '0'; + break; + case 5: + O << '1'; + break; + case 7: + O << '_'; + break; + default: + break; + } +} + +void R600InstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "ExecMask,"); +} + +void R600InstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + AMDGPUInstPrinter::printIfSet(MI, OpNo, O, "Pred,"); +} + +void R600InstPrinter::printWrite(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.getImm() == 0) { + O << " (MASKED)"; + } +} diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index 7bbf99a85f40..d97f04689e18 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -19,8 +19,8 @@ namespace llvm { class AMDGPUInstPrinter : public MCInstPrinter { public: - AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) + AMDGPUInstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, const MCRegisterInfo &MRI) : MCInstPrinter(MAI, 
MII, MRI) {} //Autogenerated by tblgen @@ -42,7 +42,7 @@ private: void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printS13ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU32ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O, @@ -127,6 +127,8 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printSDWADstUnused(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod, + raw_ostream &O); void printOpSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printOpSelHi(const MCInst *MI, unsigned OpNo, @@ -162,12 +164,16 @@ private: void printExpTgt(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); +public: static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm, StringRef Default = ""); static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, char Asm); +protected: void printAbs(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printHigh(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printClamp(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printClampSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -190,7 +196,6 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printWrite(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printBankSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printRSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, @@ -209,6 +214,32 @@ private: raw_ostream &O); }; +// FIXME: R600 specific parts of AMDGPUInstrPrinter should be moved here, and +// MCTargetDesc should be using R600InstPrinter for the R600 target. 
+class R600InstPrinter : public AMDGPUInstPrinter { +public: + R600InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : AMDGPUInstPrinter(MAI, MII, MRI) {} + + void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O); +}; + } // End namespace llvm #endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index a50e3eb8d9ce..778d4a7ba9d0 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -11,6 +11,7 @@ #include "MCTargetDesc/AMDGPUFixupKinds.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/StringRef.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" @@ -43,6 +44,8 @@ public: llvm_unreachable("Not implemented"); } bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } + + unsigned getMinimumNopSize() const override; bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; @@ -76,7 +79,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, MCContext *Ctx) { int64_t SignedValue = static_cast<int64_t>(Value); - switch (Fixup.getKind()) { + switch (static_cast<unsigned>(Fixup.getKind())) { case AMDGPU::fixup_si_sopp_br: { int64_t BrImm = (SignedValue - 4) / 4; @@ -133,6 +136,10 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( return Infos[Kind - FirstTargetFixupKind]; } +unsigned AMDGPUAsmBackend::getMinimumNopSize() const { + return 4; +} + bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { // If the count is not 4-byte aligned, we must be writing data into the text // section (otherwise we have unaligned instructions, and thus have far @@ -161,14 +168,30 @@ namespace { class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { bool Is64Bit; bool HasRelocationAddend; + uint8_t OSABI = ELF::ELFOSABI_NONE; public: ELFAMDGPUAsmBackend(const Target &T, const Triple &TT) : AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn), - HasRelocationAddend(TT.getOS() == Triple::AMDHSA) { } + HasRelocationAddend(TT.getOS() == Triple::AMDHSA) { + switch (TT.getOS()) { + case Triple::AMDHSA: + OSABI = ELF::ELFOSABI_AMDGPU_HSA; + break; + case Triple::AMDPAL: + OSABI = ELF::ELFOSABI_AMDGPU_PAL; + break; + case Triple::Mesa3D: + OSABI = 
ELF::ELFOSABI_AMDGPU_MESA3D; + break; + default: + break; + } + } - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { - return createAMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend, OS); + std::unique_ptr<MCObjectWriter> + createObjectWriter(raw_pwrite_stream &OS) const override { + return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend, OS); } }; diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 6abe7f3d37d5..e443b0729606 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -12,6 +12,7 @@ #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" @@ -22,7 +23,7 @@ namespace { class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { public: - AMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend); + AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend); protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, @@ -33,10 +34,9 @@ protected: } // end anonymous namespace AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit, + uint8_t OSABI, bool HasRelocationAddend) - : MCELFObjectTargetWriter(Is64Bit, - ELF::ELFOSABI_AMDGPU_HSA, - ELF::EM_AMDGPU, + : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_AMDGPU, HasRelocationAddend) {} unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, @@ -82,10 +82,11 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, llvm_unreachable("unhandled relocation type"); } -MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit, - bool HasRelocationAddend, - raw_pwrite_stream &OS) { - MCELFObjectTargetWriter *MOTW = - new AMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend); - return createELFObjectWriter(MOTW, OS, true); +std::unique_ptr<MCObjectWriter> +llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, + bool HasRelocationAddend, + raw_pwrite_stream &OS) { + auto MOTW = llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI, + HasRelocationAddend); + return createELFObjectWriter(std::move(MOTW), OS, true); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp index 43338a5bebd2..1497edc7a054 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp @@ -9,13 +9,40 @@ #include "AMDGPUELFStreamer.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCCodeEmitter.h" using namespace llvm; -MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context, - MCAsmBackend &MAB, - raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, - bool RelaxAll) { - return new AMDGPUELFStreamer(Context, MAB, OS, Emitter); +AMDGPUELFStreamer::AMDGPUELFStreamer(const Triple &T, MCContext &Context, + std::unique_ptr<MCAsmBackend> MAB, + raw_pwrite_stream &OS, + std::unique_ptr<MCCodeEmitter> Emitter) + : MCELFStreamer(Context, std::move(MAB), OS, std::move(Emitter)) { + unsigned Arch = ELF::EF_AMDGPU_ARCH_NONE; + switch (T.getArch()) { + case Triple::r600: + Arch = ELF::EF_AMDGPU_ARCH_R600; + break; + case Triple::amdgcn: + Arch = ELF::EF_AMDGPU_ARCH_GCN; + break; + default: + break; + } + + MCAssembler &MCA = 
getAssembler(); + unsigned EFlags = MCA.getELFHeaderEFlags(); + EFlags &= ~ELF::EF_AMDGPU_ARCH; + EFlags |= Arch; + MCA.setELFHeaderEFlags(EFlags); +} + +MCELFStreamer *llvm::createAMDGPUELFStreamer( + const Triple &T, MCContext &Context, std::unique_ptr<MCAsmBackend> MAB, + raw_pwrite_stream &OS, std::unique_ptr<MCCodeEmitter> Emitter, + bool RelaxAll) { + return new AMDGPUELFStreamer(T, Context, std::move(MAB), OS, + std::move(Emitter)); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h index 5319b65d65f9..0cc0a4c5cd5d 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h @@ -25,15 +25,16 @@ class MCSubtargetInfo; class AMDGPUELFStreamer : public MCELFStreamer { public: - AMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS, - MCCodeEmitter *Emitter) - : MCELFStreamer(Context, MAB, OS, Emitter) { } - + AMDGPUELFStreamer(const Triple &T, MCContext &Context, + std::unique_ptr<MCAsmBackend> MAB, raw_pwrite_stream &OS, + std::unique_ptr<MCCodeEmitter> Emitter); }; -MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, - raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, bool RelaxAll); +MCELFStreamer *createAMDGPUELFStreamer(const Triple &T, MCContext &Context, + std::unique_ptr<MCAsmBackend> MAB, + raw_pwrite_stream &OS, + std::unique_ptr<MCCodeEmitter> Emitter, + bool RelaxAll); } // namespace llvm. #endif diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp index 4e828a791e09..463e700f13b7 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp @@ -1,4 +1,4 @@ -//===--- AMDGPUCodeObjectMetadataStreamer.cpp -------------------*- C++ -*-===// +//===--- AMDGPUHSAMetadataStreamer.cpp --------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -8,12 +8,12 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU Code Object Metadata Streamer. +/// \brief AMDGPU HSA Metadata Streamer. 
/// // //===----------------------------------------------------------------------===// -#include "AMDGPUCodeObjectMetadataStreamer.h" +#include "AMDGPUHSAMetadataStreamer.h" #include "AMDGPU.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/IR/Constants.h" @@ -22,39 +22,40 @@ namespace llvm { -static cl::opt<bool> DumpCodeObjectMetadata( - "amdgpu-dump-comd", - cl::desc("Dump AMDGPU Code Object Metadata")); -static cl::opt<bool> VerifyCodeObjectMetadata( - "amdgpu-verify-comd", - cl::desc("Verify AMDGPU Code Object Metadata")); +static cl::opt<bool> DumpHSAMetadata( + "amdgpu-dump-hsa-metadata", + cl::desc("Dump AMDGPU HSA Metadata")); +static cl::opt<bool> VerifyHSAMetadata( + "amdgpu-verify-hsa-metadata", + cl::desc("Verify AMDGPU HSA Metadata")); namespace AMDGPU { -namespace CodeObject { +namespace HSAMD { -void MetadataStreamer::dump(StringRef YamlString) const { - errs() << "AMDGPU Code Object Metadata:\n" << YamlString << '\n'; +void MetadataStreamer::dump(StringRef HSAMetadataString) const { + errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n'; } -void MetadataStreamer::verify(StringRef YamlString) const { - errs() << "AMDGPU Code Object Metadata Parser Test: "; +void MetadataStreamer::verify(StringRef HSAMetadataString) const { + errs() << "AMDGPU HSA Metadata Parser Test: "; - CodeObject::Metadata FromYamlString; - if (Metadata::fromYamlString(YamlString, FromYamlString)) { + HSAMD::Metadata FromHSAMetadataString; + if (fromString(HSAMetadataString, FromHSAMetadataString)) { errs() << "FAIL\n"; return; } - std::string ToYamlString; - if (Metadata::toYamlString(FromYamlString, ToYamlString)) { + std::string ToHSAMetadataString; + if (toString(FromHSAMetadataString, ToHSAMetadataString)) { errs() << "FAIL\n"; return; } - errs() << (YamlString == ToYamlString ? "PASS" : "FAIL") << '\n'; - if (YamlString != ToYamlString) { - errs() << "Original input: " << YamlString << '\n' - << "Produced output: " << ToYamlString << '\n'; + errs() << (HSAMetadataString == ToHSAMetadataString ? "PASS" : "FAIL") + << '\n'; + if (HSAMetadataString != ToHSAMetadataString) { + errs() << "Original input: " << HSAMetadataString << '\n' + << "Produced output: " << ToHSAMetadataString << '\n'; } } @@ -196,14 +197,14 @@ std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions( } void MetadataStreamer::emitVersion() { - auto &Version = CodeObjectMetadata.mVersion; + auto &Version = HSAMetadata.mVersion; - Version.push_back(MetadataVersionMajor); - Version.push_back(MetadataVersionMinor); + Version.push_back(VersionMajor); + Version.push_back(VersionMinor); } void MetadataStreamer::emitPrintf(const Module &Mod) { - auto &Printf = CodeObjectMetadata.mPrintf; + auto &Printf = HSAMetadata.mPrintf; auto Node = Mod.getNamedMetadata("llvm.printf.fmts"); if (!Node) @@ -215,7 +216,7 @@ void MetadataStreamer::emitPrintf(const Module &Mod) { } void MetadataStreamer::emitKernelLanguage(const Function &Func) { - auto &Kernel = CodeObjectMetadata.mKernels.back(); + auto &Kernel = HSAMetadata.mKernels.back(); // TODO: What about other languages? 
auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version"); @@ -233,7 +234,7 @@ void MetadataStreamer::emitKernelLanguage(const Function &Func) { } void MetadataStreamer::emitKernelAttrs(const Function &Func) { - auto &Attrs = CodeObjectMetadata.mKernels.back().mAttrs; + auto &Attrs = HSAMetadata.mKernels.back().mAttrs; if (auto Node = Func.getMetadata("reqd_work_group_size")) Attrs.mReqdWorkGroupSize = getWorkGroupDimensions(Node); @@ -244,6 +245,10 @@ void MetadataStreamer::emitKernelAttrs(const Function &Func) { cast<ValueAsMetadata>(Node->getOperand(0))->getType(), mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()); } + if (Func.hasFnAttribute("runtime-handle")) { + Attrs.mRuntimeHandle = + Func.getFnAttribute("runtime-handle").getValueAsString().str(); + } } void MetadataStreamer::emitKernelArgs(const Function &Func) { @@ -261,12 +266,21 @@ void MetadataStreamer::emitKernelArgs(const Function &Func) { emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY); emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ); - if (!Func.getParent()->getNamedMetadata("llvm.printf.fmts")) - return; - auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(), AMDGPUASI.GLOBAL_ADDRESS); - emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer); + auto CallsPrintf = Func.getParent()->getNamedMetadata("llvm.printf.fmts"); + if (CallsPrintf) + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer); + if (Func.hasFnAttribute("calls-enqueue-kernel")) { + if (!CallsPrintf) { + // Emit a dummy argument so that the remaining hidden arguments + // have a fixed position relative to the first hidden argument. + // This is to facilitate library code to access hidden arguments. + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone); + } + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue); + emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction); + } } void MetadataStreamer::emitKernelArg(const Argument &Arg) { @@ -274,10 +288,17 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) { auto ArgNo = Arg.getArgNo(); const MDNode *Node; - StringRef TypeQual; - Node = Func->getMetadata("kernel_arg_type_qual"); + StringRef Name; + Node = Func->getMetadata("kernel_arg_name"); if (Node && ArgNo < Node->getNumOperands()) - TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); + Name = cast<MDString>(Node->getOperand(ArgNo))->getString(); + else if (Arg.hasName()) + Name = Arg.getName(); + + StringRef TypeName; + Node = Func->getMetadata("kernel_arg_type"); + if (Node && ArgNo < Node->getNumOperands()) + TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString(); StringRef BaseTypeName; Node = Func->getMetadata("kernel_arg_base_type"); @@ -294,28 +315,25 @@ void MetadataStreamer::emitKernelArg(const Argument &Arg) { AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); } - StringRef Name; - Node = Func->getMetadata("kernel_arg_name"); - if (Node && ArgNo < Node->getNumOperands()) - Name = cast<MDString>(Node->getOperand(ArgNo))->getString(); - - StringRef TypeName; - Node = Func->getMetadata("kernel_arg_type"); + StringRef TypeQual; + Node = Func->getMetadata("kernel_arg_type_qual"); if (Node && ArgNo < Node->getNumOperands()) - TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString(); + TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString(); emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(), - getValueKind(Arg.getType(), TypeQual, BaseTypeName), TypeQual, - BaseTypeName, AccQual, Name, TypeName); + 
getValueKind(Arg.getType(), TypeQual, BaseTypeName), Name, + TypeName, BaseTypeName, AccQual, TypeQual); } void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, - ValueKind ValueKind, StringRef TypeQual, - StringRef BaseTypeName, StringRef AccQual, - StringRef Name, StringRef TypeName) { - CodeObjectMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata()); - auto &Arg = CodeObjectMetadata.mKernels.back().mArgs.back(); + ValueKind ValueKind, StringRef Name, + StringRef TypeName, StringRef BaseTypeName, + StringRef AccQual, StringRef TypeQual) { + HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata()); + auto &Arg = HSAMetadata.mKernels.back().mArgs.back(); + Arg.mName = Name; + Arg.mTypeName = TypeName; Arg.mSize = DL.getTypeAllocSize(Ty); Arg.mAlign = DL.getABITypeAlignment(Ty); Arg.mValueKind = ValueKind; @@ -327,62 +345,25 @@ void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty, Arg.mPointeeAlign = DL.getABITypeAlignment(ElTy); } - Arg.mAccQual = getAccessQualifier(AccQual); - if (auto PtrTy = dyn_cast<PointerType>(Ty)) Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace()); + Arg.mAccQual = getAccessQualifier(AccQual); + + // TODO: Emit Arg.mActualAccQual. + SmallVector<StringRef, 1> SplitTypeQuals; TypeQual.split(SplitTypeQuals, " ", -1, false); for (StringRef Key : SplitTypeQuals) { auto P = StringSwitch<bool*>(Key) .Case("const", &Arg.mIsConst) - .Case("pipe", &Arg.mIsPipe) .Case("restrict", &Arg.mIsRestrict) .Case("volatile", &Arg.mIsVolatile) + .Case("pipe", &Arg.mIsPipe) .Default(nullptr); if (P) *P = true; } - - Arg.mName = Name; - Arg.mTypeName = TypeName; -} - -void MetadataStreamer::emitKernelCodeProps( - const amd_kernel_code_t &KernelCode) { - auto &CodeProps = CodeObjectMetadata.mKernels.back().mCodeProps; - - CodeProps.mKernargSegmentSize = KernelCode.kernarg_segment_byte_size; - CodeProps.mWorkgroupGroupSegmentSize = - KernelCode.workgroup_group_segment_byte_size; - CodeProps.mWorkitemPrivateSegmentSize = - KernelCode.workitem_private_segment_byte_size; - CodeProps.mWavefrontNumSGPRs = KernelCode.wavefront_sgpr_count; - CodeProps.mWorkitemNumVGPRs = KernelCode.workitem_vgpr_count; - CodeProps.mKernargSegmentAlign = KernelCode.kernarg_segment_alignment; - CodeProps.mGroupSegmentAlign = KernelCode.group_segment_alignment; - CodeProps.mPrivateSegmentAlign = KernelCode.private_segment_alignment; - CodeProps.mWavefrontSize = KernelCode.wavefront_size; -} - -void MetadataStreamer::emitKernelDebugProps( - const amd_kernel_code_t &KernelCode) { - if (!(KernelCode.code_properties & AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED)) - return; - - auto &DebugProps = CodeObjectMetadata.mKernels.back().mDebugProps; - - // FIXME: Need to pass down debugger ABI version through features. This is ok - // for now because we only have one version. 
- DebugProps.mDebuggerABIVersion.push_back(1); - DebugProps.mDebuggerABIVersion.push_back(0); - DebugProps.mReservedNumVGPRs = KernelCode.reserved_vgpr_count; - DebugProps.mReservedFirstVGPR = KernelCode.reserved_vgpr_first; - DebugProps.mPrivateSegmentBufferSGPR = - KernelCode.debug_private_segment_buffer_sgpr; - DebugProps.mWavefrontPrivateSegmentOffsetSGPR = - KernelCode.debug_wavefront_private_segment_offset_sgpr; } void MetadataStreamer::begin(const Module &Mod) { @@ -391,42 +372,36 @@ void MetadataStreamer::begin(const Module &Mod) { emitPrintf(Mod); } -void MetadataStreamer::emitKernel(const Function &Func, - const amd_kernel_code_t &KernelCode) { +void MetadataStreamer::end() { + std::string HSAMetadataString; + if (toString(HSAMetadata, HSAMetadataString)) + return; + + if (DumpHSAMetadata) + dump(HSAMetadataString); + if (VerifyHSAMetadata) + verify(HSAMetadataString); +} + +void MetadataStreamer::emitKernel( + const Function &Func, + const Kernel::CodeProps::Metadata &CodeProps, + const Kernel::DebugProps::Metadata &DebugProps) { if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL) return; - CodeObjectMetadata.mKernels.push_back(Kernel::Metadata()); - auto &Kernel = CodeObjectMetadata.mKernels.back(); + HSAMetadata.mKernels.push_back(Kernel::Metadata()); + auto &Kernel = HSAMetadata.mKernels.back(); Kernel.mName = Func.getName(); + Kernel.mSymbolName = (Twine(Func.getName()) + Twine("@kd")).str(); emitKernelLanguage(Func); emitKernelAttrs(Func); emitKernelArgs(Func); - emitKernelCodeProps(KernelCode); - emitKernelDebugProps(KernelCode); -} - -ErrorOr<std::string> MetadataStreamer::toYamlString() { - std::string YamlString; - if (auto Error = Metadata::toYamlString(CodeObjectMetadata, YamlString)) - return Error; - - if (DumpCodeObjectMetadata) - dump(YamlString); - if (VerifyCodeObjectMetadata) - verify(YamlString); - - return YamlString; -} - -ErrorOr<std::string> MetadataStreamer::toYamlString(StringRef YamlString) { - if (auto Error = Metadata::fromYamlString(YamlString, CodeObjectMetadata)) - return Error; - - return toYamlString(); + HSAMetadata.mKernels.back().mCodeProps = CodeProps; + HSAMetadata.mKernels.back().mDebugProps = DebugProps; } -} // end namespace CodeObject +} // end namespace HSAMD } // end namespace AMDGPU } // end namespace llvm diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h index c6681431d74d..bd6515521a74 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h @@ -1,4 +1,4 @@ -//===--- AMDGPUCodeObjectMetadataStreamer.h ---------------------*- C++ -*-===// +//===--- AMDGPUHSAMetadataStreamer.h ----------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -8,19 +8,18 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief AMDGPU Code Object Metadata Streamer. +/// \brief AMDGPU HSA Metadata Streamer. 
/// // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H -#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H #include "AMDGPU.h" #include "AMDKernelCodeT.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/AMDGPUCodeObjectMetadata.h" -#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/AMDGPUMetadata.h" namespace llvm { @@ -32,16 +31,16 @@ class Module; class Type; namespace AMDGPU { -namespace CodeObject { +namespace HSAMD { class MetadataStreamer final { private: - Metadata CodeObjectMetadata; + Metadata HSAMetadata; AMDGPUAS AMDGPUASI; - void dump(StringRef YamlString) const; + void dump(StringRef HSAMetadataString) const; - void verify(StringRef YamlString) const; + void verify(StringRef HSAMetadataString) const; AccessQualifier getAccessQualifier(StringRef AccQual) const; @@ -69,31 +68,29 @@ private: void emitKernelArg(const Argument &Arg); void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind, - StringRef TypeQual = "", StringRef BaseTypeName = "", - StringRef AccQual = "", StringRef Name = "", - StringRef TypeName = ""); - - void emitKernelCodeProps(const amd_kernel_code_t &KernelCode); - - void emitKernelDebugProps(const amd_kernel_code_t &KernelCode); + StringRef Name = "", StringRef TypeName = "", + StringRef BaseTypeName = "", StringRef AccQual = "", + StringRef TypeQual = ""); public: MetadataStreamer() = default; ~MetadataStreamer() = default; - void begin(const Module &Mod); + const Metadata &getHSAMetadata() const { + return HSAMetadata; + } - void end() {} - - void emitKernel(const Function &Func, const amd_kernel_code_t &KernelCode); + void begin(const Module &Mod); - ErrorOr<std::string> toYamlString(); + void end(); - ErrorOr<std::string> toYamlString(StringRef YamlString); + void emitKernel(const Function &Func, + const Kernel::CodeProps::Metadata &CodeProps, + const Kernel::DebugProps::Metadata &DebugProps); }; -} // end namespace CodeObject +} // end namespace HSAMD } // end namespace AMDGPU } // end namespace llvm -#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H +#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 136e6ec4ceb5..2b321c04fb30 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -18,6 +18,8 @@ #include "AMDGPUTargetStreamer.h" #include "InstPrinter/AMDGPUInstPrinter.h" #include "SIDefines.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -60,7 +62,8 @@ static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T, const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) { - return new AMDGPUInstPrinter(MAI, MII, MRI); + return T.getArch() == Triple::r600 ? 
new R600InstPrinter(MAI, MII, MRI) : + new AMDGPUInstPrinter(MAI, MII, MRI); } static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S, @@ -77,12 +80,12 @@ static MCTargetStreamer * createAMDGPUObjectTargetStreamer( } static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, - MCAsmBackend &MAB, raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, bool RelaxAll) { - if (T.getOS() == Triple::AMDHSA) - return createAMDGPUELFStreamer(Context, MAB, OS, Emitter, RelaxAll); - - return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll); + std::unique_ptr<MCAsmBackend> &&MAB, + raw_pwrite_stream &OS, + std::unique_ptr<MCCodeEmitter> &&Emitter, + bool RelaxAll) { + return createAMDGPUELFStreamer(T, Context, std::move(MAB), OS, + std::move(Emitter), RelaxAll); } extern "C" void LLVMInitializeAMDGPUTargetMC() { diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index f80b5f3a6dba..0b3563303ad0 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -18,6 +18,8 @@ #include "llvm/Support/DataTypes.h" +#include <memory> + namespace llvm { class MCAsmBackend; class MCCodeEmitter; @@ -47,9 +49,9 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, const Triple &TT, StringRef CPU, const MCTargetOptions &Options); -MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit, - bool HasRelocationAddend, - raw_pwrite_stream &OS); +std::unique_ptr<MCObjectWriter> +createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, + bool HasRelocationAddend, raw_pwrite_stream &OS); } // End llvm namespace #define GET_REGINFO_ENUM @@ -58,7 +60,9 @@ MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit, #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_OPERAND_ENUM +#define GET_INSTRINFO_SCHED_ENUM #include "AMDGPUGenInstrInfo.inc" +#undef GET_INSTRINFO_SCHED_ENUM #undef GET_INSTRINFO_OPERAND_ENUM #undef GET_INSTRINFO_ENUM diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 2a0032fc9adc..d897956daccf 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -39,21 +39,12 @@ using namespace llvm::AMDGPU; // AMDGPUTargetStreamer //===----------------------------------------------------------------------===// -AMDGPUTargetStreamer::AMDGPUTargetStreamer(MCStreamer &S) - : MCTargetStreamer(S) {} - -void AMDGPUTargetStreamer::EmitStartOfCodeObjectMetadata(const Module &Mod) { - CodeObjectMetadataStreamer.begin(Mod); -} - -void AMDGPUTargetStreamer::EmitKernelCodeObjectMetadata( - const Function &Func, const amd_kernel_code_t &KernelCode) { - CodeObjectMetadataStreamer.emitKernel(Func, KernelCode); -} +bool AMDGPUTargetStreamer::EmitHSAMetadata(StringRef HSAMetadataString) { + HSAMD::Metadata HSAMetadata; + if (HSAMD::fromString(HSAMetadataString, HSAMetadata)) + return false; -void AMDGPUTargetStreamer::EmitEndOfCodeObjectMetadata() { - CodeObjectMetadataStreamer.end(); - EmitCodeObjectMetadata(CodeObjectMetadataStreamer.toYamlString().get()); + return EmitHSAMetadata(HSAMetadata); } //===----------------------------------------------------------------------===// @@ -100,15 +91,30 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, } } -bool AMDGPUTargetAsmStreamer::EmitCodeObjectMetadata(StringRef YamlString) { - auto VerifiedYamlString = 
CodeObjectMetadataStreamer.toYamlString(YamlString); - if (!VerifiedYamlString) +bool AMDGPUTargetAsmStreamer::EmitISAVersion(StringRef IsaVersionString) { + OS << "\t.amd_amdgpu_isa \"" << IsaVersionString << "\"\n"; + return true; +} + +bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( + const AMDGPU::HSAMD::Metadata &HSAMetadata) { + std::string HSAMetadataString; + if (HSAMD::toString(HSAMetadata, HSAMetadataString)) return false; - OS << '\t' << AMDGPU::CodeObject::MetadataAssemblerDirectiveBegin << '\n'; - OS << VerifiedYamlString.get(); - OS << '\t' << AMDGPU::CodeObject::MetadataAssemblerDirectiveEnd << '\n'; + OS << '\t' << HSAMD::AssemblerDirectiveBegin << '\n'; + OS << HSAMetadataString << '\n'; + OS << '\t' << HSAMD::AssemblerDirectiveEnd << '\n'; + return true; +} +bool AMDGPUTargetAsmStreamer::EmitPALMetadata( + const PALMD::Metadata &PALMetadata) { + std::string PALMetadataString; + if (PALMD::toString(PALMetadata, PALMetadataString)) + return false; + + OS << '\t' << PALMD::AssemblerDirective << PALMetadataString << '\n'; return true; } @@ -124,7 +130,7 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() { } void AMDGPUTargetELFStreamer::EmitAMDGPUNote( - const MCExpr *DescSZ, ElfNote::NoteType Type, + const MCExpr *DescSZ, unsigned NoteType, function_ref<void(MCELFStreamer &)> EmitDesc) { auto &S = getStreamer(); auto &Context = S.getContext(); @@ -136,7 +142,7 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUNote( ElfNote::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC)); S.EmitIntValue(NameSZ, 4); // namesz S.EmitValue(DescSZ, 4); // descz - S.EmitIntValue(Type, 4); // type + S.EmitIntValue(NoteType, 4); // type S.EmitBytes(StringRef(ElfNote::NoteName, NameSZ)); // name S.EmitValueToAlignment(4, 0, 1, 0); // padding 0 EmitDesc(S); // desc @@ -204,9 +210,32 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL); } -bool AMDGPUTargetELFStreamer::EmitCodeObjectMetadata(StringRef YamlString) { - auto VerifiedYamlString = CodeObjectMetadataStreamer.toYamlString(YamlString); - if (!VerifiedYamlString) +bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) { + // Create two labels to mark the beginning and end of the desc field + // and a MCExpr to calculate the size of the desc field. 
+ auto &Context = getContext(); + auto *DescBegin = Context.createTempSymbol(); + auto *DescEnd = Context.createTempSymbol(); + auto *DescSZ = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(DescEnd, Context), + MCSymbolRefExpr::create(DescBegin, Context), Context); + + EmitAMDGPUNote( + DescSZ, + ELF::NT_AMD_AMDGPU_ISA, + [&](MCELFStreamer &OS) { + OS.EmitLabel(DescBegin); + OS.EmitBytes(IsaVersionString); + OS.EmitLabel(DescEnd); + } + ); + return true; +} + +bool AMDGPUTargetELFStreamer::EmitHSAMetadata( + const AMDGPU::HSAMD::Metadata &HSAMetadata) { + std::string HSAMetadataString; + if (HSAMD::toString(HSAMetadata, HSAMetadataString)) return false; // Create two labels to mark the beginning and end of the desc field @@ -220,13 +249,25 @@ bool AMDGPUTargetELFStreamer::EmitCodeObjectMetadata(StringRef YamlString) { EmitAMDGPUNote( DescSZ, - ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_METADATA, + ELF::NT_AMD_AMDGPU_HSA_METADATA, [&](MCELFStreamer &OS) { OS.EmitLabel(DescBegin); - OS.EmitBytes(VerifiedYamlString.get()); + OS.EmitBytes(HSAMetadataString); OS.EmitLabel(DescEnd); } ); + return true; +} +bool AMDGPUTargetELFStreamer::EmitPALMetadata( + const PALMD::Metadata &PALMetadata) { + EmitAMDGPUNote( + MCConstantExpr::create(PALMetadata.size() * sizeof(uint32_t), getContext()), + ELF::NT_AMD_AMDGPU_PAL_METADATA, + [&](MCELFStreamer &OS){ + for (auto I : PALMetadata) + OS.EmitIntValue(I, sizeof(uint32_t)); + } + ); return true; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 968128e94d0b..0919b754480d 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -10,9 +10,10 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H -#include "AMDGPUCodeObjectMetadataStreamer.h" #include "AMDKernelCodeT.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/AMDGPUMetadata.h" namespace llvm { #include "AMDGPUPTNote.h" @@ -27,11 +28,11 @@ class Type; class AMDGPUTargetStreamer : public MCTargetStreamer { protected: - AMDGPU::CodeObject::MetadataStreamer CodeObjectMetadataStreamer; MCContext &getContext() const { return Streamer.getContext(); } public: - AMDGPUTargetStreamer(MCStreamer &S); + AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} + virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major, uint32_t Minor) = 0; @@ -44,15 +45,17 @@ public: virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0; - virtual void EmitStartOfCodeObjectMetadata(const Module &Mod); + /// \returns True on success, false on failure. + virtual bool EmitISAVersion(StringRef IsaVersionString) = 0; - virtual void EmitKernelCodeObjectMetadata( - const Function &Func, const amd_kernel_code_t &KernelCode); + /// \returns True on success, false on failure. + virtual bool EmitHSAMetadata(StringRef HSAMetadataString); - virtual void EmitEndOfCodeObjectMetadata(); + /// \returns True on success, false on failure. + virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0; /// \returns True on success, false on failure. 
- virtual bool EmitCodeObjectMetadata(StringRef YamlString) = 0; + virtual bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) = 0; }; class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer { @@ -71,14 +74,19 @@ public: void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; /// \returns True on success, false on failure. - bool EmitCodeObjectMetadata(StringRef YamlString) override; + bool EmitISAVersion(StringRef IsaVersionString) override; + + /// \returns True on success, false on failure. + bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; + + /// \returns True on success, false on failure. + bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override; }; class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { MCStreamer &Streamer; - void EmitAMDGPUNote(const MCExpr *DescSize, - AMDGPU::ElfNote::NoteType Type, + void EmitAMDGPUNote(const MCExpr *DescSize, unsigned NoteType, function_ref<void(MCELFStreamer &)> EmitDesc); public: @@ -98,7 +106,13 @@ public: void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; /// \returns True on success, false on failure. - bool EmitCodeObjectMetadata(StringRef YamlString) override; + bool EmitISAVersion(StringRef IsaVersionString) override; + + /// \returns True on success, false on failure. + bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; + + /// \returns True on success, false on failure. + bool EmitPALMetadata(const AMDGPU::PALMD::Metadata &PALMetadata) override; }; } diff --git a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt index 09e3efad10af..f9cb4678dc51 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt @@ -1,8 +1,8 @@ add_llvm_library(LLVMAMDGPUDesc AMDGPUAsmBackend.cpp - AMDGPUCodeObjectMetadataStreamer.cpp AMDGPUELFObjectWriter.cpp AMDGPUELFStreamer.cpp + AMDGPUHSAMetadataStreamer.cpp AMDGPUMCAsmInfo.cpp AMDGPUMCCodeEmitter.cpp AMDGPUMCTargetDesc.cpp diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 376c9bfe5ccf..94c0157edeb5 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -278,7 +278,7 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, return; // Check for additional literals in SRC0/1/2 (Op 1/2/3) - for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) { + for (unsigned i = 0, e = Desc.getNumOperands(); i < e; ++i) { // Check if this operand should be encoded as [SV]Src if (!AMDGPU::isSISrcOperand(Desc, i)) diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td index 06e2c11b0193..30a2df510386 100644 --- a/lib/Target/AMDGPU/MIMGInstructions.td +++ b/lib/Target/AMDGPU/MIMGInstructions.td @@ -63,13 +63,13 @@ multiclass MIMG_NoSampler <bits<7> op, string asm> { class MIMG_Store_Helper <bits<7> op, string asm, RegisterClass data_rc, - RegisterClass addr_rc> : MIMG_Helper < + RegisterClass addr_rc, + string dns = ""> : MIMG_Helper < (outs), (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, dmask:$dmask, unorm:$unorm, GLC:$glc, slc:$slc, r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), - asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" - >, MIMGe<op> { + asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", dns>, MIMGe<op> { let ssamp = 0; let mayLoad = 1; // 
TableGen requires this for matching with the intrinsics let mayStore = 1; @@ -81,7 +81,8 @@ class MIMG_Store_Helper <bits<7> op, string asm, multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm, RegisterClass data_rc, int channels> { - def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32>, + def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32, + !if(!eq(channels, 1), "AMDGPU", "")>, MIMG_Mask<asm#"_V1", channels>; def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>, MIMG_Mask<asm#"_V2", channels>; @@ -257,7 +258,11 @@ defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">; defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">; //def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; //def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; + +let mayLoad = 0, mayStore = 0 in { defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; +} + defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">; defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>; defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">; @@ -331,7 +336,11 @@ defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, "image_gather4_c_l_o">; defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, "image_gather4_c_b_o">; defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, "image_gather4_c_b_cl_o">; defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, "image_gather4_c_lz_o">; + +let mayLoad = 0, mayStore = 0 in { defm IMAGE_GET_LOD : MIMG_Sampler_WQM <0x00000060, "image_get_lod">; +} + defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, "image_sample_cd">; defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, "image_sample_cd_cl">; defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, "image_sample_c_cd">; @@ -349,7 +358,7 @@ defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o" /********** ======================= **********/ // Image + sampler -class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < +class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), (opcode $addr, $rsrc, $sampler, @@ -371,7 +380,7 @@ multiclass SampleRawPatterns<SDPatternOperator name, string opcode> { // 2. Handle v4i32 rsrc type (Register Class for the instruction to be SReg_128). // 3. Add A16 support when we pass address of half type. 
multiclass AMDGCNSamplePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> { - def : Pat< + def : GCNPat< (dt (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i1:$unorm, i1:$glc, i1:$slc, i1:$lwe, i1:$da)), (opcode $addr, $rsrc, $sampler, @@ -396,7 +405,7 @@ multiclass AMDGCNSamplePatterns<SDPatternOperator name, string opcode> { } // Image only -class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < +class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat < (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm, imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe), (opcode $addr, $rsrc, @@ -411,7 +420,7 @@ multiclass ImagePatterns<SDPatternOperator name, string opcode> { } multiclass ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> { - def : Pat < + def : GCNPat < (dt (name vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe, i1:$da)), (opcode $addr, $rsrc, @@ -434,7 +443,7 @@ multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> { } multiclass ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType dt, ValueType vt> { - def : Pat < + def : GCNPat < (name dt:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, i1:$glc, i1:$slc, i1:$lwe, i1:$da), (opcode $data, $addr, $rsrc, @@ -456,7 +465,7 @@ multiclass ImageStorePatterns<SDPatternOperator name, string opcode> { defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V4), v4f32>; } -class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < +class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : GCNPat < (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)) >; @@ -467,7 +476,7 @@ multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> { def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>; } -class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat < +class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : GCNPat < (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), (EXTRACT_SUBREG @@ -584,34 +593,34 @@ defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">; defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">; /* SIsample for simple 1D texture lookup */ -def : Pat < +def : GCNPat < (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm), (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) >; -class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat < +class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm), (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) >; -class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < +class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT), (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0) >; -class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < +class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : GCNPat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY), (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) >; class SampleShadowPattern<SDNode name, MIMG opcode, - ValueType vt> : Pat < + ValueType vt> : GCNPat < (name vt:$addr, 
v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW), (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) >; class SampleShadowArrayPattern<SDNode name, MIMG opcode, - ValueType vt> : Pat < + ValueType vt> : GCNPat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) >; diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td index d30d1d382588..d50dae78e247 100644 --- a/lib/Target/AMDGPU/Processors.td +++ b/lib/Target/AMDGPU/Processors.td @@ -1,4 +1,4 @@ -//===-- Processors.td - R600 Processor definitions ------------------------===// +//===-- Processors.td - AMDGPU Processor definitions ----------------------===// // // The LLVM Compiler Infrastructure // @@ -7,217 +7,6 @@ // //===----------------------------------------------------------------------===// -class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features> -: Processor<Name, itin, Features>; - -//===----------------------------------------------------------------------===// -// R600 -//===----------------------------------------------------------------------===// -def : Proc<"r600", R600_VLIW5_Itin, - [FeatureR600, FeatureVertexCache, FeatureWavefrontSize64]>; - -def : Proc<"r630", R600_VLIW5_Itin, - [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>; - -def : Proc<"rs880", R600_VLIW5_Itin, - [FeatureR600, FeatureWavefrontSize16]>; - -def : Proc<"rv670", R600_VLIW5_Itin, - [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; - -//===----------------------------------------------------------------------===// -// R700 -//===----------------------------------------------------------------------===// - -def : Proc<"rv710", R600_VLIW5_Itin, - [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; - -def : Proc<"rv730", R600_VLIW5_Itin, - [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; - -def : Proc<"rv770", R600_VLIW5_Itin, - [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; - -//===----------------------------------------------------------------------===// -// Evergreen -//===----------------------------------------------------------------------===// - -def : Proc<"cedar", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32, - FeatureCFALUBug]>; - -def : Proc<"redwood", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64, - FeatureCFALUBug]>; - -def : Proc<"sumo", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>; - -def : Proc<"juniper", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; - -def : Proc<"cypress", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureFP64, FeatureVertexCache, - FeatureWavefrontSize64]>; - -//===----------------------------------------------------------------------===// -// Northern Islands -//===----------------------------------------------------------------------===// - -def : Proc<"barts", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; - -def : Proc<"turks", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; - -def : Proc<"caicos", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureCFALUBug]>; - -def : Proc<"cayman", R600_VLIW4_Itin, - [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>; - -//===----------------------------------------------------------------------===// -// Southern Islands 
-//===----------------------------------------------------------------------===// - -def : ProcessorModel<"gfx600", SIFullSpeedModel, - [FeatureISAVersion6_0_0]>; - -def : ProcessorModel<"SI", SIFullSpeedModel, - [FeatureISAVersion6_0_0] ->; - -def : ProcessorModel<"tahiti", SIFullSpeedModel, - [FeatureISAVersion6_0_0] ->; - -def : ProcessorModel<"gfx601", SIQuarterSpeedModel, - [FeatureISAVersion6_0_1] ->; - -def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, - [FeatureISAVersion6_0_1]>; - -def : ProcessorModel<"verde", SIQuarterSpeedModel, - [FeatureISAVersion6_0_1]>; - -def : ProcessorModel<"oland", SIQuarterSpeedModel, - [FeatureISAVersion6_0_1]>; - -def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureISAVersion6_0_1]>; - -//===----------------------------------------------------------------------===// -// Sea Islands -//===----------------------------------------------------------------------===// - -def : ProcessorModel<"gfx700", SIQuarterSpeedModel, - [FeatureISAVersion7_0_0] ->; - -def : ProcessorModel<"bonaire", SIQuarterSpeedModel, - [FeatureISAVersion7_0_0] ->; - -def : ProcessorModel<"kaveri", SIQuarterSpeedModel, - [FeatureISAVersion7_0_0] ->; - -def : ProcessorModel<"gfx701", SIFullSpeedModel, - [FeatureISAVersion7_0_1] ->; - -def : ProcessorModel<"hawaii", SIFullSpeedModel, - [FeatureISAVersion7_0_1] ->; - -def : ProcessorModel<"gfx702", SIQuarterSpeedModel, - [FeatureISAVersion7_0_2] ->; - -def : ProcessorModel<"gfx703", SIQuarterSpeedModel, - [FeatureISAVersion7_0_3] ->; - -def : ProcessorModel<"kabini", SIQuarterSpeedModel, - [FeatureISAVersion7_0_3] ->; - -def : ProcessorModel<"mullins", SIQuarterSpeedModel, - [FeatureISAVersion7_0_3]>; - -//===----------------------------------------------------------------------===// -// Volcanic Islands -//===----------------------------------------------------------------------===// - -def : ProcessorModel<"tonga", SIQuarterSpeedModel, - [FeatureISAVersion8_0_2] ->; - -def : ProcessorModel<"iceland", SIQuarterSpeedModel, - [FeatureISAVersion8_0_0] ->; - -def : ProcessorModel<"carrizo", SIQuarterSpeedModel, - [FeatureISAVersion8_0_1] ->; - -def : ProcessorModel<"fiji", SIQuarterSpeedModel, - [FeatureISAVersion8_0_3] ->; - -def : ProcessorModel<"stoney", SIQuarterSpeedModel, - [FeatureISAVersion8_1_0] ->; - -def : ProcessorModel<"polaris10", SIQuarterSpeedModel, - [FeatureISAVersion8_0_3] ->; - -def : ProcessorModel<"polaris11", SIQuarterSpeedModel, - [FeatureISAVersion8_0_3] ->; - -def : ProcessorModel<"gfx800", SIQuarterSpeedModel, - [FeatureISAVersion8_0_0] ->; - -def : ProcessorModel<"gfx801", SIQuarterSpeedModel, - [FeatureISAVersion8_0_1] ->; - -def : ProcessorModel<"gfx802", SIQuarterSpeedModel, - [FeatureISAVersion8_0_2] ->; - -def : ProcessorModel<"gfx803", SIQuarterSpeedModel, - [FeatureISAVersion8_0_3] ->; - -def : ProcessorModel<"gfx804", SIQuarterSpeedModel, - [FeatureISAVersion8_0_4] ->; - -def : ProcessorModel<"gfx810", SIQuarterSpeedModel, - [FeatureISAVersion8_1_0] ->; - -//===----------------------------------------------------------------------===// -// GFX9 -//===----------------------------------------------------------------------===// - -def : ProcessorModel<"gfx900", SIQuarterSpeedModel, - [FeatureISAVersion9_0_0] ->; - -def : ProcessorModel<"gfx901", SIQuarterSpeedModel, - [FeatureISAVersion9_0_1] ->; - -def : ProcessorModel<"gfx902", SIQuarterSpeedModel, - [FeatureISAVersion9_0_2] ->; - -def : ProcessorModel<"gfx903", SIQuarterSpeedModel, - [FeatureISAVersion9_0_3] ->; - +FIXME: Deleting this file 
broke buildbots that don't do full rebuilds. This +file is no longer used by the backend, so it can be deleted once all +the buildbots update their dependencies. diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp index fbe45cb222d9..5e1ba6b506da 100644 --- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp +++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -44,7 +44,6 @@ static bool isCFAlu(const MachineInstr &MI) { class R600ClauseMergePass : public MachineFunctionPass { private: - static char ID; const R600InstrInfo *TII; unsigned getCFAluSize(const MachineInstr &MI) const; @@ -62,6 +61,8 @@ private: const MachineInstr &LatrCFAlu) const; public: + static char ID; + R600ClauseMergePass() : MachineFunctionPass(ID) { } bool runOnMachineFunction(MachineFunction &MF) override; @@ -69,8 +70,17 @@ public: StringRef getPassName() const override; }; +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(R600ClauseMergePass, DEBUG_TYPE, + "R600 Clause Merge", false, false) +INITIALIZE_PASS_END(R600ClauseMergePass, DEBUG_TYPE, + "R600 Clause Merge", false, false) + char R600ClauseMergePass::ID = 0; +char &llvm::R600ClauseMergePassID = R600ClauseMergePass::ID; + unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr &MI) const { assert(isCFAlu(MI)); return MI @@ -170,7 +180,7 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr &RootCFAlu, } bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); @@ -205,9 +215,6 @@ StringRef R600ClauseMergePass::getPassName() const { return "R600 Merge Clause Markers Pass"; } -} // end anonymous namespace - - llvm::FunctionPass *llvm::createR600ClauseMergePass() { return new R600ClauseMergePass(); } diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 00cbd24b84fb..0e788df1c9c0 100644 --- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -1,4 +1,4 @@ -//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===// +//===- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst ----------===// // // The LLVM Compiler Infrastructure // @@ -9,7 +9,8 @@ // /// \file /// This pass compute turns all control flow pseudo instructions into native one -/// computing their address on the fly ; it also sets STACK_SIZE info. +/// computing their address on the fly; it also sets STACK_SIZE info.
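(For reference: the R600 pass changes in this patch, like the R600ClauseMergePass hunk above, all follow the same registration idiom: the pass keeps a public static char ID, INITIALIZE_PASS_BEGIN/END register it with the PassRegistry, and a char &llvm::<Pass>ID reference is exported so the pass can be added to a pipeline by ID from outside its anonymous namespace. A minimal standalone sketch of that idiom is shown here; ExamplePass, "example-pass", and ExamplePassID are made-up names used only for illustration and are not part of this patch.)

  #include "llvm/CodeGen/MachineFunctionPass.h"
  #include "llvm/Pass.h"
  #include "llvm/PassRegistry.h"

  using namespace llvm;

  namespace llvm {
  // These declarations would normally live in a target header; the names
  // here are hypothetical.
  void initializeExamplePassPass(PassRegistry &);
  extern char &ExamplePassID;
  } // end namespace llvm

  namespace {
  class ExamplePass : public MachineFunctionPass {
  public:
    static char ID; // Public so its address can be exported below.
    ExamplePass() : MachineFunctionPass(ID) {}
    bool runOnMachineFunction(MachineFunction &MF) override {
      return false; // No-op body; reports "nothing changed".
    }
  };
  } // end anonymous namespace

  // Registers the pass with the PassRegistry under a command-line argument
  // and a display name.
  INITIALIZE_PASS(ExamplePass, "example-pass", "Example pass", false, false)

  char ExamplePass::ID = 0;
  // Exported reference so code outside the anonymous namespace can schedule
  // the pass by its ID.
  char &llvm::ExamplePassID = ExamplePass::ID;

The same three pieces (public ID, INITIALIZE_PASS registration, exported ID reference) appear in each of the R600 pass files touched in the hunks that follow.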
+// //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -29,13 +30,15 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DebugLoc.h" +#include "llvm/IR/Function.h" +#include "llvm/Pass.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> #include <cassert> #include <cstdint> -#include <new> #include <set> #include <utility> #include <vector> @@ -47,7 +50,6 @@ using namespace llvm; namespace { struct CFStack { - enum StackItem { ENTRY = 0, SUB_ENTRY = 1, @@ -214,7 +216,7 @@ void CFStack::popLoop() { class R600ControlFlowFinalizer : public MachineFunctionPass { private: - typedef std::pair<MachineInstr *, std::vector<MachineInstr *>> ClauseFile; + using ClauseFile = std::pair<MachineInstr *, std::vector<MachineInstr *>>; enum ControlFlowInstruction { CF_TC, @@ -230,7 +232,6 @@ private: CF_END }; - static char ID; const R600InstrInfo *TII = nullptr; const R600RegisterInfo *TRI = nullptr; unsigned MaxFetchInst; @@ -499,6 +500,8 @@ private: } public: + static char ID; + R600ControlFlowFinalizer() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { @@ -509,14 +512,14 @@ public: R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); - CFStack CFStack(ST, MF.getFunction()->getCallingConv()); + CFStack CFStack(ST, MF.getFunction().getCallingConv()); for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; ++MB) { MachineBasicBlock &MBB = *MB; unsigned CfCount = 0; std::vector<std::pair<unsigned, std::set<MachineInstr *>>> LoopStack; std::vector<MachineInstr * > IfThenElseStack; - if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) { + if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_VS) { BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), getHWInstrDesc(CF_CALL_FS)); CfCount++; @@ -702,9 +705,16 @@ public: } }; +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(R600ControlFlowFinalizer, DEBUG_TYPE, + "R600 Control Flow Finalizer", false, false) +INITIALIZE_PASS_END(R600ControlFlowFinalizer, DEBUG_TYPE, + "R600 Control Flow Finalizer", false, false) + char R600ControlFlowFinalizer::ID = 0; -} // end anonymous namespace +char &llvm::R600ControlFlowFinalizerID = R600ControlFlowFinalizer::ID; FunctionPass *llvm::createR600ControlFlowFinalizer() { return new R600ControlFlowFinalizer(); diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index 66def2d29caf..ffea231ee4d0 100644 --- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -1,4 +1,4 @@ -//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===// +//===- R600ExpandSpecialInstrs.cpp - Expand special instructions ----------===// // // The LLVM Compiler Infrastructure // @@ -18,27 +18,35 @@ #include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" -#include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/Pass.h" +#include <cassert> +#include <cstdint> +#include 
<iterator> using namespace llvm; +#define DEBUG_TYPE "r600-expand-special-instrs" + namespace { class R600ExpandSpecialInstrsPass : public MachineFunctionPass { private: - static char ID; - const R600InstrInfo *TII; + const R600InstrInfo *TII = nullptr; void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI, unsigned Op); public: - R600ExpandSpecialInstrsPass() : MachineFunctionPass(ID), - TII(nullptr) { } + static char ID; + + R600ExpandSpecialInstrsPass() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -47,10 +55,17 @@ public: } }; -} // End anonymous namespace +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(R600ExpandSpecialInstrsPass, DEBUG_TYPE, + "R600 Expand Special Instrs", false, false) +INITIALIZE_PASS_END(R600ExpandSpecialInstrsPass, DEBUG_TYPE, + "R600ExpandSpecialInstrs", false, false) char R600ExpandSpecialInstrsPass::ID = 0; +char &llvm::R600ExpandSpecialInstrsPassID = R600ExpandSpecialInstrsPass::ID; + FunctionPass *llvm::createR600ExpandSpecialInstrsPass() { return new R600ExpandSpecialInstrsPass(); } @@ -117,7 +132,6 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { continue; } case AMDGPU::DOT_4: { - const R600RegisterInfo &TRI = TII->getRegisterInfo(); unsigned DstReg = MI.getOperand(0).getReg(); diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h index 142f70967eda..fe367d73682f 100644 --- a/lib/Target/AMDGPU/R600FrameLowering.h +++ b/lib/Target/AMDGPU/R600FrameLowering.h @@ -27,6 +27,10 @@ public: MachineBasicBlock &MBB) const override {} int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; + + bool hasFP(const MachineFunction &MF) const override { + return false; + } }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 69a63b6941ef..66291d0be4e6 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -211,6 +211,11 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); + if (!Subtarget->hasFMA()) { + setOperationAction(ISD::FMA, MVT::f32, Expand); + setOperationAction(ISD::FMA, MVT::f64, Expand); + } + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; @@ -1145,7 +1150,9 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, // Load dword // TODO: can we be smarter about machine pointer info? - SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo()); + MachinePointerInfo PtrInfo(UndefValue::get( + Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))); + SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo); Chain = Dst.getValue(1); @@ -1184,7 +1191,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, // Store dword // TODO: Can we be smarter about MachinePointerInfo? 
- SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo()); + SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, PtrInfo); // If we are part of expanded vector, make our neighbors depend on this store if (VectorTrunc) { @@ -1308,39 +1315,39 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // return (512 + (kc_bank << 12) static int -ConstantAddressBlock(unsigned AddressSpace, AMDGPUAS AMDGPUASI) { +ConstantAddressBlock(unsigned AddressSpace) { switch (AddressSpace) { - case AMDGPUASI.CONSTANT_BUFFER_0: + case AMDGPUAS::CONSTANT_BUFFER_0: return 512; - case AMDGPUASI.CONSTANT_BUFFER_1: + case AMDGPUAS::CONSTANT_BUFFER_1: return 512 + 4096; - case AMDGPUASI.CONSTANT_BUFFER_2: + case AMDGPUAS::CONSTANT_BUFFER_2: return 512 + 4096 * 2; - case AMDGPUASI.CONSTANT_BUFFER_3: + case AMDGPUAS::CONSTANT_BUFFER_3: return 512 + 4096 * 3; - case AMDGPUASI.CONSTANT_BUFFER_4: + case AMDGPUAS::CONSTANT_BUFFER_4: return 512 + 4096 * 4; - case AMDGPUASI.CONSTANT_BUFFER_5: + case AMDGPUAS::CONSTANT_BUFFER_5: return 512 + 4096 * 5; - case AMDGPUASI.CONSTANT_BUFFER_6: + case AMDGPUAS::CONSTANT_BUFFER_6: return 512 + 4096 * 6; - case AMDGPUASI.CONSTANT_BUFFER_7: + case AMDGPUAS::CONSTANT_BUFFER_7: return 512 + 4096 * 7; - case AMDGPUASI.CONSTANT_BUFFER_8: + case AMDGPUAS::CONSTANT_BUFFER_8: return 512 + 4096 * 8; - case AMDGPUASI.CONSTANT_BUFFER_9: + case AMDGPUAS::CONSTANT_BUFFER_9: return 512 + 4096 * 9; - case AMDGPUASI.CONSTANT_BUFFER_10: + case AMDGPUAS::CONSTANT_BUFFER_10: return 512 + 4096 * 10; - case AMDGPUASI.CONSTANT_BUFFER_11: + case AMDGPUAS::CONSTANT_BUFFER_11: return 512 + 4096 * 11; - case AMDGPUASI.CONSTANT_BUFFER_12: + case AMDGPUAS::CONSTANT_BUFFER_12: return 512 + 4096 * 12; - case AMDGPUASI.CONSTANT_BUFFER_13: + case AMDGPUAS::CONSTANT_BUFFER_13: return 512 + 4096 * 13; - case AMDGPUASI.CONSTANT_BUFFER_14: + case AMDGPUAS::CONSTANT_BUFFER_14: return 512 + 4096 * 14; - case AMDGPUASI.CONSTANT_BUFFER_15: + case AMDGPUAS::CONSTANT_BUFFER_15: return 512 + 4096 * 15; default: return -1; @@ -1371,7 +1378,9 @@ SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op, // Load dword // TODO: can we be smarter about machine pointer info? - SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo()); + MachinePointerInfo PtrInfo(UndefValue::get( + Type::getInt32PtrTy(*DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))); + SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo); // Get offset within the register. 
SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, @@ -1424,8 +1433,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return scalarizeVectorLoad(LoadNode, DAG); } - int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace(), - AMDGPUASI); + int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); if (ConstantBlock > -1 && ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) || (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) { diff --git a/lib/Target/AMDGPU/R600InstrFormats.td b/lib/Target/AMDGPU/R600InstrFormats.td index 68fcc545916a..61106ed42e64 100644 --- a/lib/Target/AMDGPU/R600InstrFormats.td +++ b/lib/Target/AMDGPU/R600InstrFormats.td @@ -11,9 +11,18 @@ // //===----------------------------------------------------------------------===// +def isR600 : Predicate<"Subtarget->getGeneration() <= R600Subtarget::R700">; + +def isR600toCayman : Predicate< + "Subtarget->getGeneration() <= R600Subtarget::NORTHERN_ISLANDS">; + +class R600Pat<dag pattern, dag result> : AMDGPUPat<pattern, result> { + let SubtargetPredicate = isR600toCayman; +} + class InstR600 <dag outs, dag ins, string asm, list<dag> pattern, - InstrItinClass itin> - : AMDGPUInst <outs, ins, asm, pattern> { + InstrItinClass itin = NoItinerary> + : AMDGPUInst <outs, ins, asm, pattern>, PredicateControl { field bits<64> Inst; bit Trig = 0; @@ -31,6 +40,7 @@ class InstR600 <dag outs, dag ins, string asm, list<dag> pattern, bit IsExport = 0; bit LDS_1A2D = 0; + let SubtargetPredicate = isR600toCayman; let Namespace = "AMDGPU"; let OutOperandList = outs; let InOperandList = ins; diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp index c5da5e404200..23e646c8147c 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -30,9 +30,9 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -197,7 +197,7 @@ bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { bool R600InstrInfo::usesVertexCache(const MachineInstr &MI) const { const MachineFunction *MF = MI.getParent()->getParent(); - return !AMDGPU::isCompute(MF->getFunction()->getCallingConv()) && + return !AMDGPU::isCompute(MF->getFunction().getCallingConv()) && usesVertexCache(MI.getOpcode()); } @@ -207,7 +207,7 @@ bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const { const MachineFunction *MF = MI.getParent()->getParent(); - return (AMDGPU::isCompute(MF->getFunction()->getCallingConv()) && + return (AMDGPU::isCompute(MF->getFunction().getCallingConv()) && usesVertexCache(MI.getOpcode())) || usesTextureCache(MI.getOpcode()); } @@ -1186,10 +1186,8 @@ int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { } const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass(); - for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(), - LE = MRI.livein_end(); - LI != LE; ++LI) { - unsigned Reg = LI->first; + for (std::pair<unsigned, unsigned> LI : MRI.liveins()) { + unsigned Reg = LI.first; if (TargetRegisterInfo::isVirtualRegister(Reg) || !IndirectRC->contains(Reg)) continue; @@ -1495,3 
+1493,21 @@ void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand, FlagOp.setImm(InstFlags); } } + +unsigned R600InstrInfo::getAddressSpaceForPseudoSourceKind( + PseudoSourceValue::PSVKind Kind) const { + switch (Kind) { + case PseudoSourceValue::Stack: + case PseudoSourceValue::FixedStack: + return AMDGPUASI.PRIVATE_ADDRESS; + case PseudoSourceValue::ConstantPool: + case PseudoSourceValue::GOT: + case PseudoSourceValue::JumpTable: + case PseudoSourceValue::GlobalValueCallEntry: + case PseudoSourceValue::ExternalSymbolCallEntry: + case PseudoSourceValue::TargetCustom: + return AMDGPUASI.CONSTANT_ADDRESS; + } + llvm_unreachable("Invalid pseudo source kind"); + return AMDGPUASI.PRIVATE_ADDRESS; +} diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h index 3b828006807e..abaa37450758 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.h +++ b/lib/Target/AMDGPU/R600InstrInfo.h @@ -318,6 +318,9 @@ public: bool isRegisterLoad(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600InstrFlags::REGISTER_LOAD; } + + unsigned getAddressSpaceForPseudoSourceKind( + PseudoSourceValue::PSVKind Kind) const override; }; namespace AMDGPU { diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td index bac557ba989e..801e4e61fca6 100644 --- a/lib/Target/AMDGPU/R600Instructions.td +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -15,6 +15,13 @@ include "R600Intrinsics.td" include "R600InstrFormats.td" +// FIXME: Should not be arbitrarily split from other R600 inst classes. +class R600WrapperInst <dag outs, dag ins, string asm = "", list<dag> pattern = []> : + AMDGPUInst<outs, ins, asm, pattern>, PredicateControl { + let SubtargetPredicate = isR600toCayman; +} + + class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern = []> : InstR600 <outs, ins, asm, pattern, NullALU> { @@ -38,9 +45,7 @@ class InstFlag<string PM = "printOperand", int Default = 0> } // src_sel for ALU src operands, see also ALU_CONST, ALU_PARAM registers -def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))> { - let PrintMethod = "printSel"; -} +def SEL : OperandWithDefaultOps <i32, (ops (i32 -1))>; def BANK_SWIZZLE : OperandWithDefaultOps <i32, (ops (i32 0))> { let PrintMethod = "printBankSwizzle"; } @@ -348,12 +353,6 @@ def vtx_id2_az_extloadi8 : LoadVtxId2 <az_extloadi8>; def vtx_id2_az_extloadi16 : LoadVtxId2 <az_extloadi16>; def vtx_id2_load : LoadVtxId2 <load>; -def isR600 : Predicate<"Subtarget->getGeneration() <= R600Subtarget::R700">; - -def isR600toCayman - : Predicate< - "Subtarget->getGeneration() <= R600Subtarget::NORTHERN_ISLANDS">; - //===----------------------------------------------------------------------===// // R600 SDNodes //===----------------------------------------------------------------------===// @@ -395,7 +394,7 @@ def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>; def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>; multiclass TexPattern<bits<32> TextureOp, Instruction inst, ValueType vt = v4f32> { -def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR, +def : R600Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR, (i32 imm:$srcx), (i32 imm:$srcy), (i32 imm:$srcz), (i32 imm:$srcw), (i32 imm:$offsetx), (i32 imm:$offsety), (i32 imm:$offsetz), (i32 imm:$DST_SEL_X), (i32 imm:$DST_SEL_Y), (i32 imm:$DST_SEL_Z), @@ -481,7 +480,7 @@ class ExportBufWord1 { } multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> { - def : Pat<(R600_EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 
imm:$type), + def : R600Pat<(R600_EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)), (ExportInst R600_Reg128:$src, imm:$type, imm:$base, imm:$swz_x, imm:$swz_y, imm:$swz_z, imm:$swz_w, cf_inst, 0) @@ -492,22 +491,22 @@ multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> { multiclass SteamOutputExportPattern<Instruction ExportInst, bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> { // Stream0 - def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), + def : R600Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)), (ExportInst R600_Reg128:$src, 0, imm:$arraybase, 4095, imm:$mask, buf0inst, 0)>; // Stream1 - def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), + def : R600Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)), (ExportInst $src, 0, imm:$arraybase, 4095, imm:$mask, buf1inst, 0)>; // Stream2 - def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), + def : R600Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)), (ExportInst $src, 0, imm:$arraybase, 4095, imm:$mask, buf2inst, 0)>; // Stream3 - def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), + def : R600Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)), (ExportInst $src, 0, imm:$arraybase, 4095, imm:$mask, buf3inst, 0)>; @@ -551,7 +550,7 @@ class ExportBufInst : InstR600ISA<( def KCACHE : InstFlag<"printKCache">; -class ALU_CLAUSE<bits<4> inst, string OpName> : AMDGPUInst <(outs), +class ALU_CLAUSE<bits<4> inst, string OpName> : R600WrapperInst <(outs), (ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1, KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1, i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, @@ -580,7 +579,7 @@ class CF_WORD0_R600 { let Word0 = ADDR; } -class CF_CLAUSE_R600 <bits<7> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), +class CF_CLAUSE_R600 <bits<7> inst, dag ins, string AsmPrint> : R600WrapperInst <(outs), ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 { field bits<64> Inst; bits<4> CNT; @@ -600,7 +599,7 @@ ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 { let Inst{63-32} = Word1; } -class CF_CLAUSE_EG <bits<8> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs), +class CF_CLAUSE_EG <bits<8> inst, dag ins, string AsmPrint> : R600WrapperInst <(outs), ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG { field bits<64> Inst; @@ -623,7 +622,7 @@ def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">; def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">; def CF_ALU_ELSE_AFTER : ALU_CLAUSE<15, "ALU_ELSE_AFTER">; -def FETCH_CLAUSE : AMDGPUInst <(outs), +def FETCH_CLAUSE : R600WrapperInst <(outs), (ins i32imm:$addr), "Fetch clause starting at $addr:", [] > { field bits<8> Inst; bits<8> num; @@ -631,7 +630,7 @@ def FETCH_CLAUSE : AMDGPUInst <(outs), let isCodeGenOnly = 1; } -def ALU_CLAUSE : AMDGPUInst <(outs), +def ALU_CLAUSE : R600WrapperInst <(outs), (ins i32imm:$addr), "ALU clause starting at $addr:", [] > { field bits<8> Inst; bits<8> num; @@ -639,7 +638,7 @@ def ALU_CLAUSE : AMDGPUInst <(outs), let isCodeGenOnly = 1; } -def LITERALS : AMDGPUInst <(outs), +def LITERALS : R600WrapperInst <(outs), (ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > { let isCodeGenOnly = 1; @@ -651,16 +650,68 @@ def 
LITERALS : AMDGPUInst <(outs), let Inst{63-32} = literal2; } -def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > { +def PAD : R600WrapperInst <(outs), (ins), "PAD", [] > { field bits<64> Inst; } -let Predicates = [isR600toCayman] in { - //===----------------------------------------------------------------------===// // Common Instructions R600, R700, Evergreen, Cayman //===----------------------------------------------------------------------===// +let isCodeGenOnly = 1, isPseudo = 1 in { + +let usesCustomInserter = 1 in { + +class CLAMP <RegisterClass rc> : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "CLAMP $dst, $src0", + [(set f32:$dst, (AMDGPUclamp f32:$src0))] +>; + +class FABS <RegisterClass rc> : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "FABS $dst, $src0", + [(set f32:$dst, (fabs f32:$src0))] +>; + +class FNEG <RegisterClass rc> : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "FNEG $dst, $src0", + [(set f32:$dst, (fneg f32:$src0))] +>; + +} // usesCustomInserter = 1 + +multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass, + ComplexPattern addrPat> { +let UseNamedOperandTable = 1 in { + + def RegisterLoad : AMDGPUShaderInst < + (outs dstClass:$dst), + (ins addrClass:$addr, i32imm:$chan), + "RegisterLoad $dst, $addr", + [(set i32:$dst, (AMDGPUregister_load addrPat:$addr, (i32 timm:$chan)))] + > { + let isRegisterLoad = 1; + } + + def RegisterStore : AMDGPUShaderInst < + (outs), + (ins dstClass:$val, addrClass:$addr, i32imm:$chan), + "RegisterStore $val, $addr", + [(AMDGPUregister_store i32:$val, addrPat:$addr, (i32 timm:$chan))] + > { + let isRegisterStore = 1; + } +} +} + +} // End isCodeGenOnly = 1, isPseudo = 1 + + def ADD : R600_2OP_Helper <0x0, "ADD", fadd>; // Non-IEEE MUL: 0 * anything = 0 def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE">; @@ -732,7 +783,7 @@ def MOV : R600_1OP <0x19, "MOV", []>; // Most DUMMY_CHAINs should be eliminated during legalization, but undef // values can sneak in some to selection. let isPseudo = 1, isCodeGenOnly = 1 in { -def DUMMY_CHAIN : AMDGPUInst < +def DUMMY_CHAIN : R600WrapperInst < (outs), (ins), "DUMMY_CHAIN", @@ -743,7 +794,7 @@ def DUMMY_CHAIN : AMDGPUInst < let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { -class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst < +class MOV_IMM <ValueType vt, Operand immType> : R600WrapperInst < (outs R600_Reg32:$dst), (ins immType:$imm), "", @@ -753,20 +804,20 @@ class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst < } // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 def MOV_IMM_I32 : MOV_IMM<i32, i32imm>; -def : Pat < +def : R600Pat < (imm:$val), (MOV_IMM_I32 imm:$val) >; def MOV_IMM_GLOBAL_ADDR : MOV_IMM<iPTR, i32imm>; -def : Pat < +def : R600Pat < (AMDGPUconstdata_ptr tglobaladdr:$addr), (MOV_IMM_GLOBAL_ADDR tglobaladdr:$addr) >; def MOV_IMM_F32 : MOV_IMM<f32, f32imm>; -def : Pat < +def : R600Pat < (fpimm:$val), (MOV_IMM_F32 fpimm:$val) >; @@ -938,7 +989,10 @@ class MULADD_IEEE_Common <bits<5> inst> : R600_3OP < class FMA_Common <bits<5> inst> : R600_3OP < inst, "FMA", [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))], VecALU ->; +> +{ + let OtherPredicates = [FMA]; +} class CNDE_Common <bits<5> inst> : R600_3OP < inst, "CNDE", @@ -1149,7 +1203,7 @@ def FNEG_R600 : FNEG<R600_Reg32>; // FIXME: Should be predicated on unsafe fp math. 
multiclass DIV_Common <InstR600 recip_ieee> { -def : Pat< +def : R600Pat< (fdiv f32:$src0, f32:$src1), (MUL_IEEE $src0, (recip_ieee $src1)) >; @@ -1196,7 +1250,7 @@ let Predicates = [isR600] in { defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>; def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>; - def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; + def : R600Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; def : RsqPat<RECIPSQRT_IEEE_r600, f32>; def R600_ExportSwz : ExportSwzInst { @@ -1284,11 +1338,11 @@ defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>; // Hardcode channel to 0 // NOTE: LSHR is not available here. LSHR is per family instruction -def : Pat < +def : R600Pat < (i32 (load_private ADDRIndirect:$addr) ), (R600_RegisterLoad FRAMEri:$addr, (i32 0)) >; -def : Pat < +def : R600Pat < (store_private i32:$val, ADDRIndirect:$addr), (R600_RegisterStore i32:$val, FRAMEri:$addr, (i32 0)) >; @@ -1639,7 +1693,7 @@ def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>; def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>; class ExtractVerticalPat <Instruction inst, ValueType vec_ty, - ValueType scalar_ty> : Pat < + ValueType scalar_ty> : R600Pat < (scalar_ty (extractelt vec_ty:$vec, i32:$index)), (inst $vec, $index) >; @@ -1650,7 +1704,7 @@ def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>; def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>; class InsertVerticalPat <Instruction inst, ValueType vec_ty, - ValueType scalar_ty> : Pat < + ValueType scalar_ty> : R600Pat < (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)), (inst $vec, $value, $index) >; @@ -1664,9 +1718,11 @@ def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>; // ISel Patterns //===----------------------------------------------------------------------===// +let SubtargetPredicate = isR600toCayman in { + // CND*_INT Patterns for f32 True / False values -class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat < +class CND_INT_f32 <InstR600 cnd, CondCode cc> : R600Pat < (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc), (cnd $src0, $src1, $src2) >; @@ -1676,18 +1732,18 @@ def : CND_INT_f32 <CNDGT_INT, SETGT>; def : CND_INT_f32 <CNDGE_INT, SETGE>; //CNDGE_INT extra pattern -def : Pat < +def : R600Pat < (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_SGT), (CNDGE_INT $src0, $src1, $src2) >; // KIL Patterns -def KILP : Pat < +def KILP : R600Pat < (int_AMDGPU_kilp), (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO))) >; -def KIL : Pat < +def KIL : R600Pat < (int_AMDGPU_kill f32:$src0), (MASK_WRITE (KILLGT (f32 ZERO), $src0)) >; @@ -1736,7 +1792,7 @@ def : BitConvert <v4i32, v4f32, R600_Reg128>; // DWORDADDR pattern def : DwordAddrPat <i32, R600_Reg32>; -} // End isR600toCayman Predicate +} // End SubtargetPredicate = isR600toCayman def getLDSNoRetOp : InstrMapping { let FilterClass = "R600_LDS_1A1D"; diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 502dd3bce97e..4a14d95f1cc4 100644 --- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -1,4 +1,4 @@ -//===--------------------- R600MergeVectorRegisters.cpp -------------------===// +//===- R600MergeVectorRegisters.cpp ---------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -12,16 +12,16 @@ /// common data and/or have enough undef subreg using swizzle abilities. 
/// /// For instance let's consider the following pseudo code : -/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 +/// %5 = REG_SEQ %1, sub0, %2, sub1, %3, sub2, undef, sub3 /// ... -/// vreg7<def> = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3 -/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3 +/// %7 = REG_SEQ %1, sub0, %3, sub1, undef, sub2, %4, sub3 +/// (swizzable Inst) %7, SwizzleMask : sub0, sub1, sub2, sub3 /// /// is turned into : -/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3 +/// %5 = REG_SEQ %1, sub0, %2, sub1, %3, sub2, undef, sub3 /// ... -/// vreg7<def> = INSERT_SUBREG vreg4, sub3 -/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3 +/// %7 = INSERT_SUBREG %4, sub3 +/// (swizzable Inst) %7, SwizzleMask : sub0, sub2, sub1, sub3 /// /// This allow regalloc to reduce register pressure for vector registers and /// to reduce MOV count. @@ -44,7 +44,7 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/DebugLoc.h" -#include "llvm/PassAnalysisSupport.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -98,8 +98,13 @@ public: class R600VectorRegMerger : public MachineFunctionPass { private: + using InstructionSetMap = DenseMap<unsigned, std::vector<MachineInstr *>>; + MachineRegisterInfo *MRI; - const R600InstrInfo *TII; + const R600InstrInfo *TII = nullptr; + DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq; + InstructionSetMap PreviousRegSeqByReg; + InstructionSetMap PreviousRegSeqByUndefCount; bool canSwizzle(const MachineInstr &MI) const; bool areAllUsesSwizzeable(unsigned Reg) const; @@ -116,16 +121,10 @@ private: void RemoveMI(MachineInstr *); void trackRSI(const RegSeqInfo &RSI); - typedef DenseMap<unsigned, std::vector<MachineInstr *>> InstructionSetMap; - DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq; - InstructionSetMap PreviousRegSeqByReg; - InstructionSetMap PreviousRegSeqByUndefCount; - public: static char ID; - R600VectorRegMerger() : MachineFunctionPass(ID), - TII(nullptr) { } + R600VectorRegMerger() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -143,10 +142,17 @@ public: bool runOnMachineFunction(MachineFunction &Fn) override; }; -} // end anonymous namespace. 
+} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(R600VectorRegMerger, DEBUG_TYPE, + "R600 Vector Reg Merger", false, false) +INITIALIZE_PASS_END(R600VectorRegMerger, DEBUG_TYPE, + "R600 Vector Reg Merger", false, false) char R600VectorRegMerger::ID = 0; +char &llvm::R600VectorRegMergerID = R600VectorRegMerger::ID; + bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI) const { if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) @@ -330,7 +336,7 @@ void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) { } bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { - if (skipFunction(*Fn.getFunction())) + if (skipFunction(Fn.getFunction())) return false; const R600Subtarget &ST = Fn.getSubtarget<R600Subtarget>(); diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp index 1cb40938cee7..7340318d2d88 100644 --- a/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/lib/Target/AMDGPU/R600Packetizer.cpp @@ -51,7 +51,6 @@ public: bool runOnMachineFunction(MachineFunction &Fn) override; }; -char R600Packetizer::ID = 0; class R600PacketizerList : public VLIWPacketizerList { private: @@ -404,6 +403,15 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { } // end anonymous namespace +INITIALIZE_PASS_BEGIN(R600Packetizer, DEBUG_TYPE, + "R600 Packetizer", false, false) +INITIALIZE_PASS_END(R600Packetizer, DEBUG_TYPE, + "R600 Packetizer", false, false) + +char R600Packetizer::ID = 0; + +char &llvm::R600PacketizerID = R600Packetizer::ID; + llvm::FunctionPass *llvm::createR600Packetizer() { return new R600Packetizer(); } diff --git a/lib/Target/AMDGPU/R600Processors.td b/lib/Target/AMDGPU/R600Processors.td new file mode 100644 index 000000000000..89194dc1bdf6 --- /dev/null +++ b/lib/Target/AMDGPU/R600Processors.td @@ -0,0 +1,90 @@ +//===-- R600Processors.td - R600 Processor definitions --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Radeon HD 2000/3000 Series (R600). +//===----------------------------------------------------------------------===// + +def : Processor<"r600", R600_VLIW5_Itin, + [FeatureR600, FeatureWavefrontSize64, FeatureVertexCache] +>; + +def : Processor<"r630", R600_VLIW5_Itin, + [FeatureR600, FeatureWavefrontSize32, FeatureVertexCache] +>; + +def : Processor<"rs880", R600_VLIW5_Itin, + [FeatureR600, FeatureWavefrontSize16] +>; + +def : Processor<"rv670", R600_VLIW5_Itin, + [FeatureR600, FeatureWavefrontSize64, FeatureVertexCache] +>; + +//===----------------------------------------------------------------------===// +// Radeon HD 4000 Series (R700). +//===----------------------------------------------------------------------===// + +def : Processor<"rv710", R600_VLIW5_Itin, + [FeatureR700, FeatureWavefrontSize32, FeatureVertexCache] +>; + +def : Processor<"rv730", R600_VLIW5_Itin, + [FeatureR700, FeatureWavefrontSize32, FeatureVertexCache] +>; + +def : Processor<"rv770", R600_VLIW5_Itin, + [FeatureR700, FeatureWavefrontSize64, FeatureVertexCache] +>; + +//===----------------------------------------------------------------------===// +// Radeon HD 5000 Series (Evergreen). 
+//===----------------------------------------------------------------------===// + +def : Processor<"cedar", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureWavefrontSize32, FeatureVertexCache, + FeatureCFALUBug] +>; + +def : Processor<"cypress", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureWavefrontSize64, FeatureVertexCache, FeatureFMA] +>; + +def : Processor<"juniper", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureWavefrontSize64, FeatureVertexCache] +>; + +def : Processor<"redwood", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureWavefrontSize64, FeatureVertexCache, + FeatureCFALUBug] +>; + +def : Processor<"sumo", R600_VLIW5_Itin, + [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug] +>; + +//===----------------------------------------------------------------------===// +// Radeon HD 6000 Series (Northern Islands). +//===----------------------------------------------------------------------===// + +def : Processor<"barts", R600_VLIW5_Itin, + [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug] +>; + +def : Processor<"caicos", R600_VLIW5_Itin, + [FeatureNorthernIslands, FeatureCFALUBug] +>; + +def : Processor<"cayman", R600_VLIW4_Itin, + [FeatureNorthernIslands, FeatureCaymanISA, FeatureFMA] +>; + +def : Processor<"turks", R600_VLIW5_Itin, + [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug] +>; diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td index 3c1e8527284c..84ab328bdb2b 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.td +++ b/lib/Target/AMDGPU/R600RegisterInfo.td @@ -147,6 +147,7 @@ def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; def AR_X : R600Reg<"AR.x", 0>; +def INDIRECT_BASE_ADDR : R600Reg <"INDIRECT_BASE_ADDR", 0>; def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, (add (sequence "ArrayBase%u", 448, 480))>; diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 8cb35c506135..150d8c3dc3d3 100644 --- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -1,4 +1,4 @@ -//===-- SIAnnotateControlFlow.cpp - ------------------===// +//===- SIAnnotateControlFlow.cpp ------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -14,16 +14,32 @@ #include "AMDGPU.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" +#include <cassert> +#include <utility> using namespace llvm; @@ -32,8 +48,8 @@ using namespace llvm; namespace { // Complex types used in this pass -typedef std::pair<BasicBlock *, Value *> StackEntry; -typedef SmallVector<StackEntry, 16> 
StackVector; +using StackEntry = std::pair<BasicBlock *, Value *>; +using StackVector = SmallVector<StackEntry, 16>; class SIAnnotateControlFlow : public FunctionPass { DivergenceAnalysis *DA; @@ -89,8 +105,7 @@ class SIAnnotateControlFlow : public FunctionPass { public: static char ID; - SIAnnotateControlFlow(): - FunctionPass(ID) { } + SIAnnotateControlFlow() : FunctionPass(ID) {} bool doInitialization(Module &M) override; @@ -105,7 +120,6 @@ public: AU.addPreserved<DominatorTreeWrapperPass>(); FunctionPass::getAnalysisUsage(AU); } - }; } // end anonymous namespace @@ -186,7 +200,7 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) { // \brief Erase "Phi" if it is not used any more void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { - if (llvm::RecursivelyDeleteDeadPHINode(Phi)) { + if (RecursivelyDeleteDeadPHINode(Phi)) { DEBUG(dbgs() << "Erased unused condition phi\n"); } } @@ -215,7 +229,6 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) { Value *SIAnnotateControlFlow::handleLoopCondition( Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term, SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) { - // Only search through PHI nodes which are inside the loop. If we try this // with PHI nodes that are outside of the loop, we end up inserting new PHI // nodes outside of the loop which depend on values defined inside the loop. @@ -223,7 +236,6 @@ Value *SIAnnotateControlFlow::handleLoopCondition( // 'Instruction does not dominate all users!' errors. PHINode *Phi = nullptr; if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) { - BasicBlock *Parent = Phi->getParent(); PHINode *NewPhi = PHINode::Create(Int64, 0, "loop.phi", &Parent->front()); Value *Ret = NewPhi; @@ -333,7 +345,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); - for (WeakTrackingVH Val : reverse(LoopPhiConditions)) { + for (WeakTrackingVH Val : llvm::reverse(LoopPhiConditions)) { if (PHINode *Cond = cast_or_null<PHINode>(Val)) eraseIfUnused(Cond); } @@ -360,7 +372,7 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { Preds.push_back(Pred); } - BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false); + BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false); } Value *Exec = popSaved(); diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index 3915c0e5bdbe..a9f6069e798a 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -67,7 +67,25 @@ enum : uint64_t { SCALAR_STORE = UINT64_C(1) << 39, FIXED_SIZE = UINT64_C(1) << 40, VOPAsmPrefer32Bit = UINT64_C(1) << 41, - HasFPClamp = UINT64_C(1) << 42 + VOP3_OPSEL = UINT64_C(1) << 42, + maybeAtomic = UINT64_C(1) << 43, + renamedInGFX9 = UINT64_C(1) << 44, + + // Is a clamp on FP type. + FPClamp = UINT64_C(1) << 45, + + // Is an integer clamp + IntClamp = UINT64_C(1) << 46, + + // Clamps lo component of register. + ClampLo = UINT64_C(1) << 47, + + // Clamps hi component of register. + // ClampLo and ClampHi set for packed clamp. + ClampHi = UINT64_C(1) << 48, + + // Is a packed VOP3P instruction. + IsPacked = UINT64_C(1) << 49 }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. @@ -137,7 +155,8 @@ namespace SISrcMods { SEXT = 1 << 0, // Integer sign-extend modifier NEG_HI = ABS, // Floating-point negate high packed component modifier. 
OP_SEL_0 = 1 << 2, - OP_SEL_1 = 1 << 3 + OP_SEL_1 = 1 << 3, + DST_OP_SEL = 1 << 3 // VOP3 dst op_sel (share mask with OP_SEL_1) }; } @@ -175,8 +194,10 @@ namespace EncValues { // Encoding values of enum9/8/7 operands enum { SGPR_MIN = 0, SGPR_MAX = 101, - TTMP_MIN = 112, - TTMP_MAX = 123, + TTMP_VI_MIN = 112, + TTMP_VI_MAX = 123, + TTMP_GFX9_MIN = 108, + TTMP_GFX9_MAX = 123, INLINE_INTEGER_C_MIN = 128, INLINE_INTEGER_C_POSITIVE_MAX = 192, // 64 INLINE_INTEGER_C_MAX = 208, @@ -349,6 +370,8 @@ enum SDWA9EncValues{ SRC_VGPR_MAX = 255, SRC_SGPR_MIN = 256, SRC_SGPR_MAX = 357, + SRC_TTMP_MIN = 364, + SRC_TTMP_MAX = 379, }; } // namespace SDWA @@ -359,7 +382,9 @@ enum SDWA9EncValues{ #define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) #define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128 #define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228 +#define R_00B328_SPI_SHADER_PGM_RSRC1_ES 0x00B328 #define R_00B428_SPI_SHADER_PGM_RSRC1_HS 0x00B428 +#define R_00B528_SPI_SHADER_PGM_RSRC1_LS 0x00B528 #define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 #define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) #define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) diff --git a/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp deleted file mode 100644 index d4d3959658e7..000000000000 --- a/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp +++ /dev/null @@ -1,88 +0,0 @@ -//===-- SIFixControlFlowLiveIntervals.cpp - Fix CF live intervals ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// \brief Spilling of EXEC masks used for control flow messes up control flow -/// lowering, so mark all live intervals associated with CF instructions as -/// non-spillable. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "SIInstrInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-fix-cf-live-intervals" - -namespace { - -class SIFixControlFlowLiveIntervals : public MachineFunctionPass { -public: - static char ID; - -public: - SIFixControlFlowLiveIntervals() : MachineFunctionPass(ID) { - initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { return "SI Fix CF Live Intervals"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LiveIntervals>(); - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace. 
- -INITIALIZE_PASS_BEGIN(SIFixControlFlowLiveIntervals, DEBUG_TYPE, - "SI Fix CF Live Intervals", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(SIFixControlFlowLiveIntervals, DEBUG_TYPE, - "SI Fix CF Live Intervals", false, false) - -char SIFixControlFlowLiveIntervals::ID = 0; - -char &llvm::SIFixControlFlowLiveIntervalsID = SIFixControlFlowLiveIntervals::ID; - -FunctionPass *llvm::createSIFixControlFlowLiveIntervalsPass() { - return new SIFixControlFlowLiveIntervals(); -} - -bool SIFixControlFlowLiveIntervals::runOnMachineFunction(MachineFunction &MF) { - LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); - - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - switch (MI.getOpcode()) { - case AMDGPU::SI_IF: - case AMDGPU::SI_ELSE: - case AMDGPU::SI_BREAK: - case AMDGPU::SI_IF_BREAK: - case AMDGPU::SI_ELSE_BREAK: - case AMDGPU::SI_END_CF: { - unsigned Reg = MI.getOperand(0).getReg(); - LIS->getInterval(Reg).markNotSpillable(); - break; - } - default: - break; - } - } - } - - return false; -} diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 0a795c99f94e..8b155c2d2780 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -1,4 +1,4 @@ -//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===// +//===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===// // // The LLVM Compiler Infrastructure // @@ -14,46 +14,46 @@ /// Register Class <vsrc> is the union of <vgpr> and <sgpr> /// /// BB0: -/// %vreg0 <sgpr> = SCALAR_INST -/// %vreg1 <vsrc> = COPY %vreg0 <sgpr> +/// %0 <sgpr> = SCALAR_INST +/// %1 <vsrc> = COPY %0 <sgpr> /// ... /// BRANCH %cond BB1, BB2 /// BB1: -/// %vreg2 <vgpr> = VECTOR_INST -/// %vreg3 <vsrc> = COPY %vreg2 <vgpr> +/// %2 <vgpr> = VECTOR_INST +/// %3 <vsrc> = COPY %2 <vgpr> /// BB2: -/// %vreg4 <vsrc> = PHI %vreg1 <vsrc>, <BB#0>, %vreg3 <vrsc>, <BB#1> -/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc> +/// %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vrsc>, <%bb.1> +/// %5 <vgpr> = VECTOR_INST %4 <vsrc> /// /// /// The coalescer will begin at BB0 and eliminate its copy, then the resulting /// code will look like this: /// /// BB0: -/// %vreg0 <sgpr> = SCALAR_INST +/// %0 <sgpr> = SCALAR_INST /// ... /// BRANCH %cond BB1, BB2 /// BB1: -/// %vreg2 <vgpr> = VECTOR_INST -/// %vreg3 <vsrc> = COPY %vreg2 <vgpr> +/// %2 <vgpr> = VECTOR_INST +/// %3 <vsrc> = COPY %2 <vgpr> /// BB2: -/// %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <vsrc>, <BB#1> -/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr> +/// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1> +/// %5 <vgpr> = VECTOR_INST %4 <sgpr> /// /// Now that the result of the PHI instruction is an SGPR, the register -/// allocator is now forced to constrain the register class of %vreg3 to +/// allocator is now forced to constrain the register class of %3 to /// <sgpr> so we end up with final code like this: /// /// BB0: -/// %vreg0 <sgpr> = SCALAR_INST +/// %0 <sgpr> = SCALAR_INST /// ... /// BRANCH %cond BB1, BB2 /// BB1: -/// %vreg2 <vgpr> = VECTOR_INST -/// %vreg3 <sgpr> = COPY %vreg2 <vgpr> +/// %2 <vgpr> = VECTOR_INST +/// %3 <sgpr> = COPY %2 <vgpr> /// BB2: -/// %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <sgpr>, <BB#1> -/// %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr> +/// %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1> +/// %5 <vgpr> = VECTOR_INST %4 <sgpr> /// /// Now this code contains an illegal copy from a VGPR to an SGPR. 
/// @@ -68,14 +68,34 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "SIRegisterInfo.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include <cassert> +#include <cstdint> +#include <iterator> +#include <list> +#include <map> +#include <tuple> +#include <utility> using namespace llvm; @@ -89,13 +109,17 @@ static cl::opt<bool> EnableM0Merge( namespace { class SIFixSGPRCopies : public MachineFunctionPass { - MachineDominatorTree *MDT; - + MachinePostDominatorTree *MPDT; + DenseMap<MachineBasicBlock *, SetVector<MachineBasicBlock*>> PDF; + void computePDF(MachineFunction * MF); +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void printPDF(); +#endif public: static char ID; - SIFixSGPRCopies() : MachineFunctionPass(ID) { } + SIFixSGPRCopies() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -104,12 +128,14 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addPreserved<MachinePostDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } }; -} // End anonymous namespace +} // end anonymous namespace INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) @@ -117,7 +143,6 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE, "SI Fix SGPR copies", false, false) - char SIFixSGPRCopies::ID = 0; char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID; @@ -287,7 +312,6 @@ static bool phiHasVGPROperands(const MachineInstr &PHI, const MachineRegisterInfo &MRI, const SIRegisterInfo *TRI, const SIInstrInfo *TII) { - for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) { unsigned Reg = PHI.getOperand(i).getReg(); if (TRI->hasVGPRs(MRI.getRegClass(Reg))) @@ -295,10 +319,10 @@ static bool phiHasVGPROperands(const MachineInstr &PHI, } return false; } + static bool phiHasBreakDef(const MachineInstr &PHI, const MachineRegisterInfo &MRI, SmallSet<unsigned, 8> &Visited) { - for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) { unsigned Reg = PHI.getOperand(i).getReg(); if (Visited.count(Reg)) @@ -337,6 +361,8 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, const SIInstrInfo *TII, unsigned &SMovOp, int64_t &Imm) { + if (Copy->getOpcode() != AMDGPU::COPY) + return false; if (!MoveImm->isMoveImmediate()) return false; @@ -368,13 +394,12 @@ template <class UnaryPredicate> bool searchPredecessors(const MachineBasicBlock *MBB, const MachineBasicBlock *CutOff, UnaryPredicate Predicate) { - if (MBB == CutOff) return false; - DenseSet<const MachineBasicBlock*> Visited; - SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(), - 
MBB->pred_end()); + DenseSet<const MachineBasicBlock *> Visited; + SmallVector<MachineBasicBlock *, 4> Worklist(MBB->pred_begin(), + MBB->pred_end()); while (!Worklist.empty()) { MachineBasicBlock *MBB = Worklist.pop_back_val(); @@ -392,12 +417,6 @@ bool searchPredecessors(const MachineBasicBlock *MBB, return false; } -static bool predsHasDivergentTerminator(MachineBasicBlock *MBB, - const TargetRegisterInfo *TRI) { - return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) { - return hasTerminatorThatModifiesExec(*MBB, *TRI); }); -} - // Checks if there is potential path From instruction To instruction. // If CutOff is specified and it sits in between of that path we ignore // a higher portion of the path and report it is not reachable. @@ -430,7 +449,7 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, const MachineRegisterInfo &MRI, MachineDominatorTree &MDT) { // List of inits by immediate value. - typedef std::map<unsigned, std::list<MachineInstr*>> InitListMap; + using InitListMap = std::map<unsigned, std::list<MachineInstr *>>; InitListMap Inits; // List of clobbering instructions. SmallVector<MachineInstr*, 8> Clobbers; @@ -487,16 +506,18 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, MDT.properlyDominates(Clobber->getParent(), MBBTo)); }; - return (any_of(Clobbers, interferes)) || - (any_of(Inits, [&](InitListMap::value_type &C) { - return C.first != Init.first && any_of(C.second, interferes); + return (llvm::any_of(Clobbers, interferes)) || + (llvm::any_of(Inits, [&](InitListMap::value_type &C) { + return C.first != Init.first && + llvm::any_of(C.second, interferes); })); }; if (MDT.dominates(MI1, MI2)) { if (!intereferes(MI2, MI1)) { - DEBUG(dbgs() << "Erasing from BB#" << MI2->getParent()->getNumber() - << " " << *MI2); + DEBUG(dbgs() << "Erasing from " + << printMBBReference(*MI2->getParent()) << " " + << *MI2); MI2->eraseFromParent(); Defs.erase(I2++); Changed = true; @@ -504,8 +525,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, } } else if (MDT.dominates(MI2, MI1)) { if (!intereferes(MI1, MI2)) { - DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber() - << " " << *MI1); + DEBUG(dbgs() << "Erasing from " + << printMBBReference(*MI1->getParent()) << " " + << *MI1); MI1->eraseFromParent(); Defs.erase(I1++); Changed = true; @@ -521,10 +543,11 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, MachineBasicBlock::iterator I = MBB->getFirstNonPHI(); if (!intereferes(MI1, I) && !intereferes(MI2, I)) { - DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber() - << " " << *MI1 << "and moving from BB#" - << MI2->getParent()->getNumber() << " to BB#" - << I->getParent()->getNumber() << " " << *MI2); + DEBUG(dbgs() << "Erasing from " + << printMBBReference(*MI1->getParent()) << " " << *MI1 + << "and moving from " + << printMBBReference(*MI2->getParent()) << " to " + << printMBBReference(*I->getParent()) << " " << *MI2); I->getParent()->splice(I, MI2->getParent(), MI2); MI1->eraseFromParent(); Defs.erase(I1++); @@ -544,18 +567,52 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, return Changed; } +void SIFixSGPRCopies::computePDF(MachineFunction *MF) { + MachineFunction::iterator B = MF->begin(); + MachineFunction::iterator E = MF->end(); + for (; B != E; ++B) { + if (B->succ_size() > 1) { + for (auto S : B->successors()) { + MachineDomTreeNode *runner = MPDT->getNode(&*S); + MachineDomTreeNode *sentinel = MPDT->getNode(&*B)->getIDom(); + while (runner && runner != sentinel) { + PDF[runner->getBlock()].insert(&*B); + runner = 
runner->getIDom(); + } + } + } + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void SIFixSGPRCopies::printPDF() { + dbgs() << "\n######## PostDominanceFrontiers set #########\n"; + for (auto &I : PDF) { + dbgs() << "PDF[ " << I.first->getNumber() << "] : "; + for (auto &J : I.second) { + dbgs() << J->getNumber() << ' '; + } + dbgs() << '\n'; + } + dbgs() << "\n##############################################\n"; +} +#endif + bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); MachineRegisterInfo &MRI = MF.getRegInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); MDT = &getAnalysis<MachineDominatorTree>(); + MPDT = &getAnalysis<MachinePostDominatorTree>(); + PDF.clear(); + computePDF(&MF); + DEBUG(printPDF()); SmallVector<MachineInstr *, 16> Worklist; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { @@ -564,7 +621,9 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { switch (MI.getOpcode()) { default: continue; - case AMDGPU::COPY: { + case AMDGPU::COPY: + case AMDGPU::WQM: + case AMDGPU::WWM: { // If the destination register is a physical register there isn't really // much we can do to fix this. if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) @@ -602,14 +661,27 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) break; - // We don't need to fix the PHI if the common dominator of the - // two incoming blocks terminates with a uniform branch. - if (MI.getNumExplicitOperands() == 5) { - MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB(); - MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB(); - - if (!predsHasDivergentTerminator(MBB0, TRI) && - !predsHasDivergentTerminator(MBB1, TRI)) { + // We don't need to fix the PHI if all the source blocks + // have no divergent control dependecies + bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII); + if (!HasVGPROperand) { + bool Uniform = true; + MachineBasicBlock * Join = MI.getParent(); + for (auto &O : MI.explicit_operands()) { + if (O.isMBB()) { + MachineBasicBlock * Source = O.getMBB(); + SetVector<MachineBasicBlock*> &SourcePDF = PDF[Source]; + SetVector<MachineBasicBlock*> &JoinPDF = PDF[Join]; + SetVector<MachineBasicBlock*> CDList; + for (auto &I : SourcePDF) { + if (!JoinPDF.count(I) || /* back edge */MDT->dominates(Join, I)) { + if (hasTerminatorThatModifiesExec(*I, *TRI)) + Uniform = false; + } + } + } + } + if (Uniform) { DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n'); break; } @@ -649,14 +721,13 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { // is no chance for values to be over-written. 
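The uniform-branch test in the PHI handling above consults the post-dominance frontiers built by computePDF earlier in this file. As a rough standalone illustration of that frontier walk (hand-written block numbers, successor lists, and post-idom links; nothing below is taken from the patch itself):

#include <cstdio>
#include <map>
#include <set>
#include <vector>

int main() {
  // A tiny hypothetical CFG: 0 -> {1,2}, 1 -> 3, 2 -> 3, 3 is the exit.
  std::map<int, std::vector<int>> Succs = {{0, {1, 2}}, {1, {3}}, {2, {3}}, {3, {}}};
  // Immediate post-dominators, hand-written for this CFG (-1 means none).
  std::map<int, int> IPDom = {{0, 3}, {1, 3}, {2, 3}, {3, -1}};

  // Same walk as computePDF: for every block with more than one successor,
  // each successor's post-idom chain, up to but not including the block's own
  // post-idom, gets the block added to its frontier.
  std::map<int, std::set<int>> PDF;
  for (auto &B : Succs) {
    if (B.second.size() <= 1)
      continue;
    int Sentinel = IPDom[B.first];
    for (int S : B.second)
      for (int Runner = S; Runner != -1 && Runner != Sentinel;
           Runner = IPDom[Runner])
        PDF[Runner].insert(B.first);
  }

  for (auto &Entry : PDF) {
    std::printf("PDF[%d]:", Entry.first);
    for (int B : Entry.second)
      std::printf(" %d", B);
    std::printf("\n");
  }
  return 0;
}

For this graph the walk reports PDF[1] = PDF[2] = {0}, i.e. blocks 1 and 2 are control dependent on block 0, which is the property the PHI check above uses when it looks for frontier blocks whose terminators modify exec.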
SmallSet<unsigned, 8> Visited; - if (phiHasVGPROperands(MI, MRI, TRI, TII) || - !phiHasBreakDef(MI, MRI, Visited)) { + if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) { DEBUG(dbgs() << "Fixing PHI: " << MI); TII->moveToVALU(MI); } break; } - case AMDGPU::REG_SEQUENCE: { + case AMDGPU::REG_SEQUENCE: if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || !hasVGPROperands(MI, TRI)) { foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI); @@ -667,7 +738,6 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { TII->moveToVALU(MI); break; - } case AMDGPU::INSERT_SUBREG: { const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; DstRC = MRI.getRegClass(MI.getOperand(0).getReg()); diff --git a/lib/Target/AMDGPU/SIFixWWMLiveness.cpp b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp new file mode 100644 index 000000000000..3493c7775f0c --- /dev/null +++ b/lib/Target/AMDGPU/SIFixWWMLiveness.cpp @@ -0,0 +1,202 @@ +//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Computations in WWM can overwrite values in inactive channels for +/// variables that the register allocator thinks are dead. This pass adds fake +/// uses of those variables to WWM instructions to make sure that they aren't +/// overwritten. +/// +/// As an example, consider this snippet: +/// %vgpr0 = V_MOV_B32_e32 0.0 +/// if (...) { +/// %vgpr1 = ... +/// %vgpr2 = WWM killed %vgpr1 +/// ... = killed %vgpr2 +/// %vgpr0 = V_MOV_B32_e32 1.0 +/// } +/// ... = %vgpr0 +/// +/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally, +/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since +/// writing %vgpr1 would only write to channels that would be clobbered by the +/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled, +/// it would clobber even the inactive channels for which the if-condition is +/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use +/// of %vgpr0 to the WWM instruction to make sure they aren't allocated to the +/// same register. +/// +/// In general, we need to figure out what registers might have their inactive +/// channels which are eventually used accidentally clobbered by a WWM +/// instruction. We approximate this using two conditions: +/// +/// 1. A definition of the variable reaches the WWM instruction. +/// 2. The variable would be live at the WWM instruction if all its defs were +/// partial defs (i.e. considered as a use), ignoring normal uses. +/// +/// If a register matches both conditions, then we add an implicit use of it to +/// the WWM instruction. Condition #2 is the heart of the matter: every +/// definition is really a partial definition, since every VALU instruction is +/// implicitly predicated. We can usually ignore this, but WWM forces us not +/// to. Condition #1 prevents false positives if the variable is undefined at +/// the WWM instruction anyways. This is overly conservative in certain cases, +/// especially in uniform control flow, but this is a workaround anyways until +/// LLVM gains the notion of predicated uses and definitions of variables. 
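Expressed outside of MachineIR, the rule the two numbered conditions describe is a set intersection. A minimal standalone sketch (the virtual register numbers are made up, and plain sorted vectors stand in for the pass's SparseBitVectors):

#include <algorithm>
#include <cstdio>
#include <iterator>
#include <vector>

int main() {
  // Condition 1: registers with a definition that reaches the WWM instruction.
  std::vector<unsigned> Reachable = {100, 101, 103};
  // Condition 2: registers that would be live at the WWM instruction if every
  // definition were treated as a partial definition (i.e. as a use).
  std::vector<unsigned> LiveAcross = {101, 102, 103};

  // Only registers meeting both conditions receive an implicit use.
  std::vector<unsigned> ImplicitUses;
  std::set_intersection(Reachable.begin(), Reachable.end(),
                        LiveAcross.begin(), LiveAcross.end(),
                        std::back_inserter(ImplicitUses));
  for (unsigned Reg : ImplicitUses)
    std::printf("add implicit use of %%%u to the WWM instruction\n", Reg);
  return 0;
}

Here only %101 and %103 satisfy both conditions, so only those would be pinned to the WWM instruction and kept out of the registers WWM may clobber.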
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SparseBitVector.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-fix-wwm-liveness" + +namespace { + +class SIFixWWMLiveness : public MachineFunctionPass { +private: + LiveIntervals *LIS = nullptr; + const SIRegisterInfo *TRI; + MachineRegisterInfo *MRI; + +public: + static char ID; + + SIFixWWMLiveness() : MachineFunctionPass(ID) { + initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + bool runOnWWMInstruction(MachineInstr &MI); + + void addDefs(const MachineInstr &MI, SparseBitVector<> &set); + + StringRef getPassName() const override { return "SI Fix WWM Liveness"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + // Should preserve the same set that TwoAddressInstructions does. + AU.addPreserved<SlotIndexes>(); + AU.addPreserved<LiveIntervals>(); + AU.addPreservedID(LiveVariablesID); + AU.addPreservedID(MachineLoopInfoID); + AU.addPreservedID(MachineDominatorsID); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace. + +INITIALIZE_PASS(SIFixWWMLiveness, DEBUG_TYPE, + "SI fix WWM liveness", false, false) + +char SIFixWWMLiveness::ID = 0; + +char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID; + +FunctionPass *llvm::createSIFixWWMLivenessPass() { + return new SIFixWWMLiveness(); +} + +void SIFixWWMLiveness::addDefs(const MachineInstr &MI, SparseBitVector<> &Regs) +{ + for (const MachineOperand &Op : MI.defs()) { + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + if (TRI->isVGPR(*MRI, Reg)) + Regs.set(Reg); + } + } +} + +bool SIFixWWMLiveness::runOnWWMInstruction(MachineInstr &WWM) { + MachineBasicBlock *MBB = WWM.getParent(); + + // Compute the registers that are live out of MI by figuring out which defs + // are reachable from MI. + SparseBitVector<> LiveOut; + + for (auto II = MachineBasicBlock::iterator(WWM), IE = + MBB->end(); II != IE; ++II) { + addDefs(*II, LiveOut); + } + + for (df_iterator<MachineBasicBlock *> I = ++df_begin(MBB), + E = df_end(MBB); + I != E; ++I) { + for (const MachineInstr &MI : **I) { + addDefs(MI, LiveOut); + } + } + + // Compute the registers that reach MI. + SparseBitVector<> Reachable; + + for (auto II = ++MachineBasicBlock::reverse_iterator(WWM), IE = + MBB->rend(); II != IE; ++II) { + addDefs(*II, Reachable); + } + + for (idf_iterator<MachineBasicBlock *> I = ++idf_begin(MBB), + E = idf_end(MBB); + I != E; ++I) { + for (const MachineInstr &MI : **I) { + addDefs(MI, Reachable); + } + } + + // find the intersection, and add implicit uses. + LiveOut &= Reachable; + + bool Modified = false; + for (unsigned Reg : LiveOut) { + WWM.addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true)); + if (LIS) { + // FIXME: is there a better way to update the live interval? + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + } + Modified = true; + } + + return Modified; +} + +bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) { + bool Modified = false; + + // This doesn't actually need LiveIntervals, but we can preserve them. 
+ LIS = getAnalysisIfAvailable<LiveIntervals>(); + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + TRI = &TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() == AMDGPU::EXIT_WWM) { + Modified |= runOnWWMInstruction(MI); + } + } + } + + return Modified; +} diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 0aad8f0843d6..783181980342 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -14,7 +14,7 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -290,11 +290,11 @@ void SIFoldOperands::foldOperand( // copy since a subregister use tied to a full register def doesn't really // make sense. e.g. don't fold: // - // %vreg1 = COPY %vreg0:sub1 - // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1<tied0> + // %1 = COPY %0:sub1 + // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0> // // into - // %vreg2<tied3> = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1<tied0> + // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0> if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister) return; } @@ -628,7 +628,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, MachineOperand *NonInlineUse = nullptr; int NonInlineUseOpNo = -1; - MachineRegisterInfo::use_iterator NextUse, NextInstUse; + MachineRegisterInfo::use_iterator NextUse; for (MachineRegisterInfo::use_iterator Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); Use != E; Use = NextUse) { @@ -723,12 +723,15 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, } } +// Clamp patterns are canonically selected to v_max_* instructions, so only +// handle them. const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const { unsigned Op = MI.getOpcode(); switch (Op) { case AMDGPU::V_MAX_F32_e64: case AMDGPU::V_MAX_F16_e64: - case AMDGPU::V_MAX_F64: { + case AMDGPU::V_MAX_F64: + case AMDGPU::V_PK_MAX_F16: { if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm()) return nullptr; @@ -736,14 +739,24 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const { const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); if (!Src0->isReg() || !Src1->isReg() || + Src0->getReg() != Src1->getReg() || Src0->getSubReg() != Src1->getSubReg() || Src0->getSubReg() != AMDGPU::NoSubRegister) return nullptr; // Can't fold up if we have modifiers. - if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || - TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) || - TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) + if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) + return nullptr; + + unsigned Src0Mods + = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm(); + unsigned Src1Mods + = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm(); + + // Having a 0 op_sel_hi would require swizzling the output in the source + // instruction, which we can't do. + unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? 
SISrcMods::OP_SEL_1 : 0; + if (Src0Mods != UnsetMods && Src1Mods != UnsetMods) return nullptr; return Src0; } @@ -765,14 +778,18 @@ static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) { return true; } +// FIXME: Clamp for v_mad_mixhi_f16 handled during isel. bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) { const MachineOperand *ClampSrc = isClamp(MI); if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg())) return false; MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg()); - if (!TII->hasFPClamp(*Def)) + + // The type of clamp must be compatible. + if (TII->getClampMask(*Def) != TII->getClampMask(MI)) return false; + MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp); if (!DefClamp) return false; @@ -909,7 +926,7 @@ bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) { } bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; MRI = &MF.getRegInfo(); @@ -954,9 +971,9 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { // Prevent folding operands backwards in the function. For example, // the COPY opcode must not be replaced by 1 in this example: // - // %vreg3<def> = COPY %VGPR0; VGPR_32:%vreg3 + // %3 = COPY %vgpr0; VGPR_32:%3 // ... - // %VGPR0<def> = V_MOV_B32_e32 1, %EXEC<imp-use> + // %vgpr0 = V_MOV_B32_e32 1, implicit %exec MachineOperand &Dst = MI.getOperand(0); if (Dst.isReg() && !TargetRegisterInfo::isVirtualRegister(Dst.getReg())) diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 7334781916d8..89bb98dbd028 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -38,6 +38,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST, MachineBasicBlock &MBB) const { const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo* TRI = &TII->getRegisterInfo(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // We don't need this if we only have spills since there is no user facing // scratch. @@ -55,7 +56,7 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST, MachineBasicBlock::iterator I = MBB.begin(); unsigned FlatScratchInitReg - = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT); + = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); MachineRegisterInfo &MRI = MF.getRegInfo(); MRI.addLiveIn(FlatScratchInitReg); @@ -64,7 +65,6 @@ void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST, unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); // Do a 64-bit pointer add. @@ -219,7 +219,6 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was // specified. const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - auto AMDGPUASI = ST.getAMDGPUAS(); if (ST.debuggerEmitPrologue()) emitDebuggerPrologue(MF, MBB); @@ -283,13 +282,13 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, } // We need to insert initialization of the scratch resource descriptor. 
- unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( + AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; if (ST.isAmdCodeObjectV2(MF)) { - PreloadedPrivateBufferReg = TRI->getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + PreloadedPrivateBufferReg = MFI->getPreloadedReg( + AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); } bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg); @@ -356,7 +355,64 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, .addReg(PreloadedPrivateBufferReg, RegState::Kill); } - if (ResourceRegUsed && (ST.isMesaGfxShader(MF) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister))) { + if (ResourceRegUsed) + emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I, + PreloadedPrivateBufferReg, ScratchRsrcReg); +} + +// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. +void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST, + MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI, + MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg, + unsigned ScratchRsrcReg) const { + + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + DebugLoc DL; + + if (ST.isAmdPalOS()) { + // The pointer to the GIT is formed from the offset passed in and either + // the amdgpu-git-ptr-high function attribute or the top part of the PC + unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + + const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); + + if (MFI->getGITPtrHigh() != 0xffffffff) { + BuildMI(MBB, I, DL, SMovB32, RsrcHi) + .addImm(MFI->getGITPtrHigh()) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } else { + const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64); + BuildMI(MBB, I, DL, GetPC64, Rsrc01); + } + BuildMI(MBB, I, DL, SMovB32, RsrcLo) + .addReg(AMDGPU::SGPR0) // Low address passed in + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + // We now have the GIT ptr - now get the scratch descriptor from the entry + // at offset 0. 
+ PointerType *PtrTy = + PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()), + AMDGPUAS::CONSTANT_ADDRESS); + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); + const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM); + auto MMO = MF.getMachineMemOperand(PtrInfo, + MachineMemOperand::MOLoad | + MachineMemOperand::MOInvariant | + MachineMemOperand::MODereferenceable, + 0, 0); + BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg) + .addReg(Rsrc01) + .addImm(0) // offset + .addImm(0) // glc + .addReg(ScratchRsrcReg, RegState::ImplicitDefine) + .addMemOperand(MMO); + return; + } + if (ST.isMesaGfxShader(MF) + || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) { assert(!ST.isAmdCodeObjectV2(MF)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); @@ -369,7 +425,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, if (MFI->hasImplicitBufferPtr()) { unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); - if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { + if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64); BuildMI(MBB, I, DL, Mov64, Rsrc01) @@ -379,8 +435,8 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM); PointerType *PtrTy = - PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()), - AMDGPUASI.CONSTANT_ADDRESS); + PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()), + AMDGPUAS::CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); auto MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad | @@ -454,6 +510,15 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, .addImm(NumBytes * ST.getWavefrontSize()) .setMIFlag(MachineInstr::FrameSetup); } + + for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg + : FuncInfo->getSGPRSpillVGPRs()) { + if (!Reg.FI.hasValue()) + continue; + TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true, + Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass, + &TII->getRegisterInfo()); + } } void SIFrameLowering::emitEpilogue(MachineFunction &MF, @@ -462,6 +527,19 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, if (FuncInfo->isEntryFunction()) return; + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + + for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg + : FuncInfo->getSGPRSpillVGPRs()) { + if (!Reg.FI.hasValue()) + continue; + TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR, + Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass, + &TII->getRegisterInfo()); + } + unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg(); if (StackPtrReg == AMDGPU::NoRegister) return; @@ -469,9 +547,6 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, const MachineFrameInfo &MFI = MF.getFrameInfo(); uint32_t NumBytes = MFI.getStackSize(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const SIInstrInfo *TII = ST.getInstrInfo(); - MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); DebugLoc DL; // FIXME: Clarify distinction between no set SP and SP. 
For callee functions, @@ -575,6 +650,50 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( } } +void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS) const { + TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // The SP is specifically managed and we don't want extra spills of it. + SavedRegs.reset(MFI->getStackPtrOffsetReg()); +} + +MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( + MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + int64_t Amount = I->getOperand(0).getImm(); + if (Amount == 0) + return MBB.erase(I); + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const DebugLoc &DL = I->getDebugLoc(); + unsigned Opc = I->getOpcode(); + bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); + uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; + + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + if (!TFI->hasReservedCallFrame(MF)) { + unsigned Align = getStackAlignment(); + + Amount = alignTo(Amount, Align); + assert(isUInt<32>(Amount) && "exceeded stack address space size"); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + unsigned SPReg = MFI->getStackPtrOffsetReg(); + + unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; + BuildMI(MBB, I, DL, TII->get(Op), SPReg) + .addReg(SPReg) + .addImm(Amount * ST.getWavefrontSize()); + } else if (CalleePopAmount != 0) { + llvm_unreachable("is this used?"); + } + + return MBB.erase(I); +} + void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index d4dfa1c7eaa8..df6f1632a316 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -35,10 +35,18 @@ public: int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; + void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, + RegScavenger *RS = nullptr) const override; + void processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS = nullptr) const override; + MachineBasicBlock::iterator + eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; + private: void emitFlatScratchInit(const SISubtarget &ST, MachineFunction &MF, @@ -61,6 +69,12 @@ private: /// \brief Emits debugger prologue. void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const; + // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. 
+ void emitEntryFunctionScratchSetup(const SISubtarget &ST, MachineFunction &MF, + MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI, + MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg, + unsigned ScratchRsrcReg) const; + public: bool hasFP(const MachineFunction &MF) const override; bool hasSP(const MachineFunction &MF) const; diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 2356405f0919..50ee88fa635a 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -32,6 +32,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" @@ -45,11 +46,14 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/TargetCallingConv.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -70,9 +74,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetCallingConv.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetRegisterInfo.h" #include <cassert> #include <cmath> #include <cstdint> @@ -83,11 +85,21 @@ using namespace llvm; +#define DEBUG_TYPE "si-lower" + +STATISTIC(NumTailCalls, "Number of tail calls"); + static cl::opt<bool> EnableVGPRIndexMode( "amdgpu-vgpr-index-mode", cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false)); +static cl::opt<unsigned> AssumeFrameIndexHighZeroBits( + "amdgpu-frame-index-zero-bits", + cl::desc("High bits of frame index assumed to be zero"), + cl::init(5), + cl::ReallyHidden); + static unsigned findFirstFreeSGPR(CCState &CCInfo) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { @@ -214,6 +226,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::ADDCARRY, MVT::i32, Legal); setOperationAction(ISD::SUBCARRY, MVT::i32, Legal); +#if 0 + setOperationAction(ISD::ADDCARRY, MVT::i64, Legal); + setOperationAction(ISD::SUBCARRY, MVT::i64, Legal); +#endif + + //setOperationAction(ISD::ADDC, MVT::i64, Expand); + //setOperationAction(ISD::SUBC, MVT::i64, Expand); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. 
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, @@ -462,6 +482,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand); setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand); setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); @@ -496,6 +517,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::BUILD_VECTOR); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. @@ -528,8 +550,7 @@ const SISubtarget *SITargetLowering::getSubtarget() const { // TargetLowering queries //===----------------------------------------------------------------------===// -bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, - EVT) const { +bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { // SI has some legal vector types, but no legal vector operations. Say no // shuffles are legal in order to prefer scalarizing some vector operations. return false; @@ -537,6 +558,7 @@ bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &CI, + MachineFunction &MF, unsigned IntrID) const { switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: @@ -545,11 +567,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); Info.align = 0; + Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4)); - Info.vol = !Vol || !Vol->isZero(); - Info.readMem = true; - Info.writeMem = true; + if (!Vol || !Vol->isZero()) + Info.flags |= MachineMemOperand::MOVolatile; + return true; } default: @@ -587,6 +610,26 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { return isUInt<12>(AM.BaseOffs) && AM.Scale == 0; } +bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { + if (Subtarget->hasFlatGlobalInsts()) + return isInt<13>(AM.BaseOffs) && AM.Scale == 0; + + if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { + // Assume the we will use FLAT for all global memory accesses + // on VI. + // FIXME: This assumption is currently wrong. On VI we still use + // MUBUF instructions for the r + i addressing mode. As currently + // implemented, the MUBUF instructions only work on buffer < 4GB. + // It may be possible to support > 4GB buffers with MUBUF instructions, + // by setting the stride value in the resource descriptor which would + // increase the size limit to (stride * 4GB). However, this is risky, + // because it has never been validated. + return isLegalFlatAddressingMode(AM); + } + + return isLegalMUBUFAddressingMode(AM); +} + bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and // additionally can do r + r + i with addr64. 
32-bit has more addressing @@ -624,27 +667,15 @@ bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, - unsigned AS) const { + unsigned AS, Instruction *I) const { // No global is ever allowed as a base. if (AM.BaseGV) return false; - if (AS == AMDGPUASI.GLOBAL_ADDRESS) { - if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { - // Assume the we will use FLAT for all global memory accesses - // on VI. - // FIXME: This assumption is currently wrong. On VI we still use - // MUBUF instructions for the r + i addressing mode. As currently - // implemented, the MUBUF instructions only work on buffer < 4GB. - // It may be possible to support > 4GB buffers with MUBUF instructions, - // by setting the stride value in the resource descriptor which would - // increase the size limit to (stride * 4GB). However, this is risky, - // because it has never been validated. - return isLegalFlatAddressingMode(AM); - } + if (AS == AMDGPUASI.GLOBAL_ADDRESS) + return isLegalGlobalAddressingMode(AM); - return isLegalMUBUFAddressingMode(AM); - } else if (AS == AMDGPUASI.CONSTANT_ADDRESS) { + if (AS == AMDGPUASI.CONSTANT_ADDRESS) { // If the offset isn't a multiple of 4, it probably isn't going to be // correctly aligned. // FIXME: Can we get the real alignment here? @@ -656,7 +687,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // FIXME?: We also need to do this if unaligned, but we don't know the // alignment here. if (DL.getTypeStoreSize(Ty) < 4) - return isLegalMUBUFAddressingMode(AM); + return isLegalGlobalAddressingMode(AM); if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { // SMRD instructions have an 8-bit, dword offset on SI. 
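The addressing-mode hunks above hinge on immediate-offset widths: with flat global instructions the offset must be a 13-bit signed value, MUBUF takes a 12-bit unsigned byte offset, and SI SMRD takes an 8-bit dword offset. A rough standalone sketch of those range checks (isSignedIntN and isUnsignedIntN are local stand-ins for llvm::isInt and llvm::isUInt, and the sample offsets are made up):

#include <cstdint>
#include <cstdio>

template <unsigned N> bool isSignedIntN(int64_t X) {
  return X >= -(INT64_C(1) << (N - 1)) && X < (INT64_C(1) << (N - 1));
}
template <unsigned N> bool isUnsignedIntN(int64_t X) {
  return X >= 0 && X < (INT64_C(1) << N);
}

int main() {
  const int64_t Offsets[] = {-4096, -16, 0, 1020, 4092, 4096};
  for (int64_t Off : Offsets) {
    bool FlatGlobalOk = isSignedIntN<13>(Off);   // global_* immediate offset
    bool MUBUFOk = isUnsignedIntN<12>(Off);      // buffer_* immediate offset
    bool SISMRDOk = Off >= 0 && Off % 4 == 0 &&  // SI SMRD: 8-bit dword offset
                    isUnsignedIntN<8>(Off / 4);
    std::printf("offset %6lld: flat-global %d  mubuf %d  si-smrd %d\n",
                (long long)Off, FlatGlobalOk, MUBUFOk, SISMRDOk);
  }
  return 0;
}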
@@ -888,18 +919,30 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, uint64_t Offset) const { const DataLayout &DL = DAG.getDataLayout(); MachineFunction &MF = DAG.getMachineFunction(); - const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, - SIRegisterInfo::KERNARG_SEGMENT_PTR); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + const ArgDescriptor *InputPtrReg; + const TargetRegisterClass *RC; + + std::tie(InputPtrReg, RC) + = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS); SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, - MRI.getLiveInVirtReg(InputPtrReg), PtrVT); + MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); + return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, DAG.getConstant(Offset, SL, PtrVT)); } +SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, + const SDLoc &SL) const { + auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>(); + uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); + return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset); +} + SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val, bool Signed, @@ -991,6 +1034,17 @@ SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA return ArgValue; } +SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, + const SIMachineFunctionInfo &MFI, + EVT VT, + AMDGPUFunctionArgInfo::PreloadedValue PVID) const { + const ArgDescriptor *Reg; + const TargetRegisterClass *RC; + + std::tie(Reg, RC) = MFI.getPreloadedValue(PVID); + return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT); +} + static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, CallingConv::ID CallConv, ArrayRef<ISD::InputArg> Ins, @@ -1041,29 +1095,131 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, } // Allocate special inputs passed in VGPRs. -static void allocateSpecialInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { if (Info.hasWorkItemIDX()) { - unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); + unsigned Reg = AMDGPU::VGPR0; MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); } if (Info.hasWorkItemIDY()) { - unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); + unsigned Reg = AMDGPU::VGPR1; MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); } if (Info.hasWorkItemIDZ()) { - unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); + unsigned Reg = AMDGPU::VGPR2; MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); } } +// Try to allocate a VGPR at the end of the argument list, or if no argument +// VGPRs are left allocating a stack slot. 
+static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) { + ArrayRef<MCPhysReg> ArgVGPRs + = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32); + unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs); + if (RegIdx == ArgVGPRs.size()) { + // Spill to stack required. + int64_t Offset = CCInfo.AllocateStack(4, 4); + + return ArgDescriptor::createStack(Offset); + } + + unsigned Reg = ArgVGPRs[RegIdx]; + Reg = CCInfo.AllocateReg(Reg); + assert(Reg != AMDGPU::NoRegister); + + MachineFunction &MF = CCInfo.getMachineFunction(); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + return ArgDescriptor::createRegister(Reg); +} + +static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, + const TargetRegisterClass *RC, + unsigned NumArgRegs) { + ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32); + unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs); + if (RegIdx == ArgSGPRs.size()) + report_fatal_error("ran out of SGPRs for arguments"); + + unsigned Reg = ArgSGPRs[RegIdx]; + Reg = CCInfo.AllocateReg(Reg); + assert(Reg != AMDGPU::NoRegister); + + MachineFunction &MF = CCInfo.getMachineFunction(); + MF.addLiveIn(Reg, RC); + return ArgDescriptor::createRegister(Reg); +} + +static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) { + return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32); +} + +static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) { + return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); +} + +static void allocateSpecialInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { + if (Info.hasWorkItemIDX()) + Info.setWorkItemIDX(allocateVGPR32Input(CCInfo)); + + if (Info.hasWorkItemIDY()) + Info.setWorkItemIDY(allocateVGPR32Input(CCInfo)); + + if (Info.hasWorkItemIDZ()) + Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo)); +} + +static void allocateSpecialInputSGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { + auto &ArgInfo = Info.getArgInfo(); + + // TODO: Unify handling with private memory pointers. + + if (Info.hasDispatchPtr()) + ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo); + + if (Info.hasQueuePtr()) + ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo); + + if (Info.hasKernargSegmentPtr()) + ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo); + + if (Info.hasDispatchID()) + ArgInfo.DispatchID = allocateSGPR64Input(CCInfo); + + // flat_scratch_init is not applicable for non-kernel functions. + + if (Info.hasWorkGroupIDX()) + ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo); + + if (Info.hasWorkGroupIDY()) + ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo); + + if (Info.hasWorkGroupIDZ()) + ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo); + + if (Info.hasImplicitArgPtr()) + ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo); +} + // Allocate special inputs passed in user SGPRs. static void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, @@ -1187,20 +1343,38 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, if (TM.getOptLevel() == CodeGenOpt::None) HasStackObjects = true; + // For now assume stack access is needed in any callee functions, so we need + // the scratch registers to pass in. + bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); if (ST.isAmdCodeObjectV2(MF)) { - if (HasStackObjects) { + if (RequiresStackAccess) { // If we have stack objects, we unquestionably need the private buffer // resource. 
For the Code Object V2 ABI, this will be the first 4 user // SGPR inputs. We can reserve those and use them directly. - unsigned PrivateSegmentBufferReg = TRI.getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + unsigned PrivateSegmentBufferReg = Info.getPreloadedReg( + AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); Info.setScratchRSrcReg(PrivateSegmentBufferReg); - unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + if (MFI.hasCalls()) { + // If we have calls, we need to keep the frame register in a register + // that won't be clobbered by a call, so ensure it is copied somewhere. + + // This is not a problem for the scratch wave offset, because the same + // registers are reserved in all functions. + + // FIXME: Nothing is really ensuring this is a call preserved register, + // it's just selected from the end so it happens to be. + unsigned ReservedOffsetReg + = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); + Info.setScratchWaveOffsetReg(ReservedOffsetReg); + } else { + unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg( + AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + } } else { unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF); @@ -1223,9 +1397,9 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // offset is still in an input SGPR. Info.setScratchRSrcReg(ReservedBufferReg); - if (HasStackObjects) { - unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + if (HasStackObjects && !MFI.hasCalls()) { + unsigned ScratchWaveOffsetReg = Info.getPreloadedReg( + AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg); } else { unsigned ReservedOffsetReg @@ -1235,6 +1409,50 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, } } +bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const { + const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); + return !Info->isEntryFunction(); +} + +void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + +} + +void SITargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); + + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + MachineBasicBlock::iterator MBBI = Entry->begin(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (AMDGPU::SReg_64RegClass.contains(*I)) + RC = &AMDGPU::SGPR_64RegClass; + else if (AMDGPU::SReg_32RegClass.contains(*I)) + RC = &AMDGPU::SGPR_32RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + Entry->addLiveIn(*I); + BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) + .addReg(*I); + + // Insert the copy-back instructions right before the terminator. 
+ for (auto *Exit : Exits) + BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), *I) + .addReg(NewVR); + } +} + SDValue SITargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, @@ -1242,14 +1460,14 @@ SDValue SITargetLowering::LowerFormalArguments( const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); MachineFunction &MF = DAG.getMachineFunction(); - FunctionType *FType = MF.getFunction()->getFunctionType(); + FunctionType *FType = MF.getFunction().getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { - const Function *Fn = MF.getFunction(); + const Function &Fn = MF.getFunction(); DiagnosticInfoUnsupported NoGraphicsHSA( - *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); + Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); DAG.getContext()->diagnose(NoGraphicsHSA); return DAG.getEntryNode(); } @@ -1269,6 +1487,12 @@ SDValue SITargetLowering::LowerFormalArguments( bool IsKernel = AMDGPU::isKernel(CallConv); bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv); + if (!IsEntryFunc) { + // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over + // this when allocating argument fixed offsets. + CCInfo.AllocateStack(4, 4); + } + if (IsShader) { processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info); @@ -1285,14 +1509,31 @@ SDValue SITargetLowering::LowerFormalArguments( // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be // enabled too. - if (CallConv == CallingConv::AMDGPU_PS && - ((Info->getPSInputAddr() & 0x7F) == 0 || - ((Info->getPSInputAddr() & 0xF) == 0 && - Info->isPSInputAllocated(11)))) { - CCInfo.AllocateReg(AMDGPU::VGPR0); - CCInfo.AllocateReg(AMDGPU::VGPR1); - Info->markPSInputAllocated(0); - Info->markPSInputEnabled(0); + if (CallConv == CallingConv::AMDGPU_PS) { + if ((Info->getPSInputAddr() & 0x7F) == 0 || + ((Info->getPSInputAddr() & 0xF) == 0 && + Info->isPSInputAllocated(11))) { + CCInfo.AllocateReg(AMDGPU::VGPR0); + CCInfo.AllocateReg(AMDGPU::VGPR1); + Info->markPSInputAllocated(0); + Info->markPSInputEnabled(0); + } + if (Subtarget->isAmdPalOS()) { + // For isAmdPalOS, the user does not enable some bits after compilation + // based on run-time states; the register values being generated here are + // the final ones set in hardware. Therefore we need to apply the + // workaround to PSInputAddr and PSInputEnable together. (The case where + // a bit is set in PSInputAddr but not PSInputEnable is where the + // frontend set up an input arg for a particular interpolation mode, but + // nothing uses that input arg. Really we should have an earlier pass + // that removes such an arg.) 
+ unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); + if ((PsInputBits & 0x7F) == 0 || + ((PsInputBits & 0xF) == 0 && + (PsInputBits >> 11 & 1))) + Info->markPSInputEnabled( + countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined)); + } } assert(!Info->hasDispatchPtr() && @@ -1308,7 +1549,7 @@ SDValue SITargetLowering::LowerFormalArguments( } if (IsEntryFunc) { - allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); + allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); } @@ -1375,6 +1616,17 @@ SDValue SITargetLowering::LowerFormalArguments( Reg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); + if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) { + // The return object should be reasonably addressable. + + // FIXME: This helps when the return is a real sret. If it is a + // automatically inserted sret (i.e. CanLowerReturn returns false), an + // extra copy is inserted in SelectionDAGBuilder which obscures this. + unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits; + Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, + DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits))); + } + // If this is an 8 or 16-bit value, it is really passed promoted // to 32 bits. Insert an assert[sz]ext to capture this, then // truncate to the right size. @@ -1427,6 +1679,11 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } + if (!IsEntryFunc) { + // Special inputs come after user arguments. + allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); + } + // Start adding system SGPRs. if (IsEntryFunc) { allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); @@ -1434,8 +1691,16 @@ SDValue SITargetLowering::LowerFormalArguments( CCInfo.AllocateReg(Info->getScratchRSrcReg()); CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); CCInfo.AllocateReg(Info->getFrameOffsetReg()); + allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } + auto &ArgUsageInfo = + DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); + ArgUsageInfo.setFuncArgInfo(MF.getFunction(), Info->getArgInfo()); + + unsigned StackArgSize = CCInfo.getNextStackOffset(); + Info->setBytesInStackArgArea(StackArgSize); + return Chains.empty() ? Chain : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } @@ -1575,6 +1840,22 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, } // FIXME: Does sret work properly? + if (!Info->isEntryFunction()) { + const SIRegisterInfo *TRI + = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (AMDGPU::SReg_64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else if (AMDGPU::SReg_32RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i32)); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } + } // Update chain and glue. 
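// A standalone sketch (not patch content) of the SPI_PS_INPUT workaround in
// the AMDGPU_PS handling above, for the PAL OS path: if PSInputAddr masked by
// PSInputEnable violates the hardware rules (no PERSP_*/LINEAR_* input
// enabled, or POS_W_FLOAT enabled without any PERSP_*), force-enable the
// first input the frontend actually set up. Bit positions follow the comment
// in the hunk; the helper name is illustrative.
#include <cstdint>

static uint32_t fixupPSInputEnable(uint32_t PSInputAddr, uint32_t PSInputEnable) {
  uint32_t Bits = PSInputAddr & PSInputEnable;
  bool NoPerspOrLinear = (Bits & 0x7F) == 0;        // neither PERSP_* nor LINEAR_*
  bool PosWWithoutPersp = (Bits & 0xF) == 0 &&      // no PERSP_*
                          ((Bits >> 11) & 1);       // but POS_W_FLOAT enabled
  if ((NoPerspOrLinear || PosWWithoutPersp) && PSInputAddr != 0) {
    uint32_t Lowest = PSInputAddr & (~PSInputAddr + 1); // isolate lowest set bit
    PSInputEnable |= Lowest; // countTrailingZeros(PSInputAddr) in the real code
  }
  return PSInputEnable;
}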
RetOps[0] = Chain; @@ -1587,6 +1868,563 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return DAG.getNode(Opc, DL, MVT::Other, RetOps); } +SDValue SITargetLowering::LowerCallResult( + SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn, + SDValue ThisVal) const { + CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg); + + // Assign locations to each value returned by this call. + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, RetCC); + + // Copy all of the result registers out of their specified physreg. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign VA = RVLocs[i]; + SDValue Val; + + if (VA.isRegLoc()) { + Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + } else if (VA.isMemLoc()) { + report_fatal_error("TODO: return values in memory"); + } else + llvm_unreachable("unknown argument location type"); + + switch (VA.getLocInfo()) { + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); + break; + case CCValAssign::ZExt: + Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val, + DAG.getValueType(VA.getValVT())); + Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); + break; + case CCValAssign::SExt: + Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val, + DAG.getValueType(VA.getValVT())); + Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); + break; + case CCValAssign::AExt: + Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); + break; + default: + llvm_unreachable("Unknown loc info!"); + } + + InVals.push_back(Val); + } + + return Chain; +} + +// Add code to pass special inputs required depending on used features separate +// from the explicit user arguments present in the IR. +void SITargetLowering::passSpecialInputs( + CallLoweringInfo &CLI, + const SIMachineFunctionInfo &Info, + SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, + SmallVectorImpl<SDValue> &MemOpChains, + SDValue Chain, + SDValue StackPtr) const { + // If we don't have a call site, this was a call inserted by + // legalization. These can never use special inputs. + if (!CLI.CS) + return; + + const Function *CalleeFunc = CLI.CS.getCalledFunction(); + assert(CalleeFunc); + + SelectionDAG &DAG = CLI.DAG; + const SDLoc &DL = CLI.DL; + + const SISubtarget *ST = getSubtarget(); + const SIRegisterInfo *TRI = ST->getRegisterInfo(); + + auto &ArgUsageInfo = + DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); + const AMDGPUFunctionArgInfo &CalleeArgInfo + = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); + + const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo(); + + // TODO: Unify with private memory register handling. This is complicated by + // the fact that at least in kernels, the input argument is not necessarily + // in the same location as the input. 
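// A simplified standalone model (not patch content) of what passSpecialInputs
// does with the InputRegs list that follows: for each implicit input the
// callee expects, look up where the caller holds it and route the value to
// the callee's designated register or stack offset. Plain strings and structs
// stand in for AMDGPUFunctionArgInfo and the SelectionDAG plumbing; the
// implicit-arg-pointer special case (recomputed from the kernarg segment
// pointer) is omitted here.
#include <map>
#include <string>
#include <vector>

struct InputLoc {
  bool InRegister;
  unsigned RegOrOffset;
};

struct ForwardedInput {
  std::string Name;
  InputLoc From; // where the caller received the value
  InputLoc To;   // where the callee expects it
};

static std::vector<ForwardedInput>
forwardSpecialInputs(const std::map<std::string, InputLoc> &CallerInputs,
                     const std::map<std::string, InputLoc> &CalleeInputs) {
  std::vector<ForwardedInput> Copies;
  for (const auto &Needed : CalleeInputs) {
    auto It = CallerInputs.find(Needed.first);
    if (It == CallerInputs.end())
      continue; // caller never materialized this input
    Copies.push_back({Needed.first, It->second, Needed.second});
  }
  return Copies;
}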
+ AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = { + AMDGPUFunctionArgInfo::DISPATCH_PTR, + AMDGPUFunctionArgInfo::QUEUE_PTR, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR, + AMDGPUFunctionArgInfo::DISPATCH_ID, + AMDGPUFunctionArgInfo::WORKGROUP_ID_X, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, + AMDGPUFunctionArgInfo::WORKITEM_ID_X, + AMDGPUFunctionArgInfo::WORKITEM_ID_Y, + AMDGPUFunctionArgInfo::WORKITEM_ID_Z, + AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR + }; + + for (auto InputID : InputRegs) { + const ArgDescriptor *OutgoingArg; + const TargetRegisterClass *ArgRC; + + std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID); + if (!OutgoingArg) + continue; + + const ArgDescriptor *IncomingArg; + const TargetRegisterClass *IncomingArgRC; + std::tie(IncomingArg, IncomingArgRC) + = CallerArgInfo.getPreloadedValue(InputID); + assert(IncomingArgRC == ArgRC); + + // All special arguments are ints for now. + EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32; + SDValue InputReg; + + if (IncomingArg) { + InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg); + } else { + // The implicit arg ptr is special because it doesn't have a corresponding + // input for kernels, and is computed from the kernarg segment pointer. + assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); + InputReg = getImplicitArgPtr(DAG, DL); + } + + if (OutgoingArg->isRegister()) { + RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); + } else { + SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr, + InputReg, + OutgoingArg->getStackOffset()); + MemOpChains.push_back(ArgStore); + } + } +} + +static bool canGuaranteeTCO(CallingConv::ID CC) { + return CC == CallingConv::Fast; +} + +/// Return true if we might ever do TCO for calls with this calling convention. +static bool mayTailCallThisCC(CallingConv::ID CC) { + switch (CC) { + case CallingConv::C: + return true; + default: + return canGuaranteeTCO(CC); + } +} + +bool SITargetLowering::isEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { + if (!mayTailCallThisCC(CalleeCC)) + return false; + + MachineFunction &MF = DAG.getMachineFunction(); + const Function &CallerF = MF.getFunction(); + CallingConv::ID CallerCC = CallerF.getCallingConv(); + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); + const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); + + // Kernels aren't callable, and don't have a live in return address so it + // doesn't make sense to do a tail call with entry functions. + if (!CallerPreserved) + return false; + + bool CCMatch = CallerCC == CalleeCC; + + if (DAG.getTarget().Options.GuaranteedTailCallOpt) { + if (canGuaranteeTCO(CalleeCC) && CCMatch) + return true; + return false; + } + + // TODO: Can we handle var args? + if (IsVarArg) + return false; + + for (const Argument &Arg : CallerF.args()) { + if (Arg.hasByValAttr()) + return false; + } + + LLVMContext &Ctx = *DAG.getContext(); + + // Check that the call results are passed in the same way. + if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins, + CCAssignFnForCall(CalleeCC, IsVarArg), + CCAssignFnForCall(CallerCC, IsVarArg))) + return false; + + // The callee has to preserve all registers the caller needs to preserve. 
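// The check following the comment above reduces to a mask-subset test: the
// callee must preserve at least every register the caller's convention
// preserves. A minimal sketch (not patch content) over a single 64-bit mask,
// where bit i set means "register i survives the call"; regmaskSubsetEqual in
// the real code works over the full-width register masks.
#include <cstdint>

static bool calleePreservesEnough(uint64_t CallerPreserved,
                                  uint64_t CalleePreserved) {
  // CallerPreserved must be a subset of CalleePreserved.
  return (CallerPreserved & ~CalleePreserved) == 0;
}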
+ if (!CCMatch) { + const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); + if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) + return false; + } + + // Nothing more to check if the callee is taking no arguments. + if (Outs.empty()) + return true; + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx); + + CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg)); + + const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); + // If the stack arguments for this call do not fit into our own save area then + // the call cannot be made tail. + // TODO: Is this really necessary? + if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) + return false; + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals); +} + +bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { + if (!CI->isTailCall()) + return false; + + const Function *ParentFn = CI->getParent()->getParent(); + if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv())) + return false; + + auto Attr = ParentFn->getFnAttribute("disable-tail-calls"); + return (Attr.getValueAsString() != "true"); +} + +// The wave scratch offset register is used as the global base pointer. +SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + const SDLoc &DL = CLI.DL; + SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; + SmallVector<SDValue, 32> &OutVals = CLI.OutVals; + SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &IsTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool IsVarArg = CLI.IsVarArg; + bool IsSibCall = false; + bool IsThisReturn = false; + MachineFunction &MF = DAG.getMachineFunction(); + + if (IsVarArg) { + return lowerUnhandledCall(CLI, InVals, + "unsupported call to variadic function "); + } + + if (!CLI.CS.getCalledFunction()) { + return lowerUnhandledCall(CLI, InVals, + "unsupported indirect call to function "); + } + + if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) { + return lowerUnhandledCall(CLI, InVals, + "unsupported required tail call to function "); + } + + // The first 4 bytes are reserved for the callee's emergency stack slot. + const unsigned CalleeUsableStackOffset = 4; + + if (IsTailCall) { + IsTailCall = isEligibleForTailCallOptimization( + Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); + if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) { + report_fatal_error("failed to perform tail call elimination on a call " + "site marked musttail"); + } + + bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + + // A sibling call is one where we're under the usual C ABI and not planning + // to change that but can still do a tail call: + if (!TailCallOpt && IsTailCall) + IsSibCall = true; + + if (IsTailCall) + ++NumTailCalls; + } + + if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) { + // FIXME: Remove this hack for function pointer types after removing + // support of old address space mapping. In the new address space + // mapping the pointer in default address space is 64 bit, therefore + // does not need this hack. 
+ if (Callee.getValueType() == MVT::i32) { + const GlobalValue *GV = GA->getGlobal(); + Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false, + GA->getTargetFlags()); + } + } + assert(Callee.getValueType() == MVT::i64); + + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); + CCInfo.AnalyzeCallOperands(Outs, AssignFn); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + + if (IsSibCall) { + // Since we're not changing the ABI to make this a tail call, the memory + // operands are already available in the caller's incoming argument space. + NumBytes = 0; + } + + // FPDiff is the byte offset of the call's argument area from the callee's. + // Stores to callee stack arguments will be placed in FixedStackSlots offset + // by this amount for a tail call. In a sibling call it must be 0 because the + // caller will deallocate the entire stack and the callee still expects its + // arguments to begin at SP+0. Completely unused for non-tail calls. + int32_t FPDiff = 0; + MachineFrameInfo &MFI = MF.getFrameInfo(); + SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + + SDValue CallerSavedFP; + + // Adjust the stack pointer for the new arguments... + // These operations are automatically eliminated by the prolog/epilog pass + if (!IsSibCall) { + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); + + unsigned OffsetReg = Info->getScratchWaveOffsetReg(); + + // In the HSA case, this should be an identity copy. + SDValue ScratchRSrcReg + = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); + RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); + + // TODO: Don't hardcode these registers and get from the callee function. + SDValue ScratchWaveOffsetReg + = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32); + RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg); + + if (!Info->isEntryFunction()) { + // Avoid clobbering this function's FP value. In the current convention + // callee will overwrite this, so do save/restore around the call site. + CallerSavedFP = DAG.getCopyFromReg(Chain, DL, + Info->getFrameOffsetReg(), MVT::i32); + } + } + + // Stack pointer relative accesses are done by changing the offset SGPR. This + // is just the VGPR offset component. + SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32); + + SmallVector<SDValue, 8> MemOpChains; + MVT PtrVT = MVT::i32; + + // Walk the register/memloc assignments, inserting copies/loads. + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[realArgIdx]; + + // Promote the value if needed. 
+ switch (VA.getLocInfo()) { + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::FPExt: + Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); + break; + default: + llvm_unreachable("Unknown loc info!"); + } + + if (VA.isRegLoc()) { + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else { + assert(VA.isMemLoc()); + + SDValue DstAddr; + MachinePointerInfo DstInfo; + + unsigned LocMemOffset = VA.getLocMemOffset(); + int32_t Offset = LocMemOffset; + + SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset); + + if (IsTailCall) { + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + unsigned OpSize = Flags.isByVal() ? + Flags.getByValSize() : VA.getValVT().getStoreSize(); + + Offset = Offset + FPDiff; + int FI = MFI.CreateFixedObject(OpSize, Offset, true); + + DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT), + StackPtr); + DstInfo = MachinePointerInfo::getFixedStack(MF, FI); + + // Make sure any stack arguments overlapping with where we're storing + // are loaded before this eventual operation. Otherwise they'll be + // clobbered. + + // FIXME: Why is this really necessary? This seems to just result in a + // lot of code to copy the stack and write them back to the same + // locations, which are supposed to be immutable? + Chain = addTokenForArgument(Chain, DAG, MFI, FI); + } else { + DstAddr = PtrOff; + DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); + } + + if (Outs[i].Flags.isByVal()) { + SDValue SizeNode = + DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32); + SDValue Cpy = DAG.getMemcpy( + Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), + /*isVol = */ false, /*AlwaysInline = */ true, + /*isTailCall = */ false, DstInfo, + MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy( + *DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS)))); + + MemOpChains.push_back(Cpy); + } else { + SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); + MemOpChains.push_back(Store); + } + } + } + + // Copy special input registers after user input arguments. + passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr); + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. + SDValue InFlag; + for (auto &RegToPass : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, + RegToPass.second, InFlag); + InFlag = Chain.getValue(1); + } + + + SDValue PhysReturnAddrReg; + if (IsTailCall) { + // Since the return is being combined with the call, we need to pass on the + // return address. 
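// A small sketch (not patch content) of the outgoing stack-argument placement
// in the loop above: a normal call stores relative to the current stack
// pointer, while a tail call retargets the store into the caller's incoming
// argument area shifted by FPDiff. FPDiff is 0 for sibling calls, so the
// callee finds its arguments exactly where the caller received its own.
#include <cstdint>

static int64_t outgoingArgOffset(int64_t LocMemOffset, int32_t FPDiff,
                                 bool IsTailCall) {
  return IsTailCall ? LocMemOffset + FPDiff : LocMemOffset;
}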
+ + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); + SDValue ReturnAddrReg = CreateLiveInRegister( + DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64); + + PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF), + MVT::i64); + Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag); + InFlag = Chain.getValue(1); + } + + // We don't usually want to end the call-sequence here because we would tidy + // the frame up *after* the call, however in the ABI-changing tail-call case + // we've carefully laid out the parameters so that when sp is reset they'll be + // in the correct location. + if (IsTailCall && !IsSibCall) { + Chain = DAG.getCALLSEQ_END(Chain, + DAG.getTargetConstant(NumBytes, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32), + InFlag, DL); + InFlag = Chain.getValue(1); + } + + std::vector<SDValue> Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + + if (IsTailCall) { + // Each tail call may have to adjust the stack by a different amount, so + // this information must travel along with the operation for eventual + // consumption by emitEpilogue. + Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); + + Ops.push_back(PhysReturnAddrReg); + } + + // Add argument registers to the end of the list so that they are known live + // into the call. + for (auto &RegToPass : RegsToPass) { + Ops.push_back(DAG.getRegister(RegToPass.first, + RegToPass.second.getValueType())); + } + + // Add a register mask operand representing the call-preserved registers. + + const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + + if (InFlag.getNode()) + Ops.push_back(InFlag); + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + + // If we're doing a tall call, use a TC_RETURN here rather than an + // actual call instruction. + if (IsTailCall) { + MFI.setHasTailCall(); + return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops); + } + + // Returns a chain and a flag for retval copy to use. + SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops); + Chain = Call.getValue(0); + InFlag = Call.getValue(1); + + if (CallerSavedFP) { + SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32); + Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag); + InFlag = Chain.getValue(1); + } + + uint64_t CalleePopBytes = NumBytes; + Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32), + DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32), + InFlag, DL); + if (!Ins.empty()) + InFlag = Chain.getValue(1); + + // Handle result values, copying them out of physregs into vregs that we + // return. + return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, + InVals, IsThisReturn, + IsThisReturn ? OutVals[0] : SDValue()); +} + unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { unsigned Reg = StringSwitch<unsigned>(RegName) @@ -1644,7 +2482,7 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI, if (SplitPoint == BB->end()) { // Don't bother with a new block. 
- MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR)); + MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode())); return BB; } @@ -1658,7 +2496,7 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI, SplitBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(SplitBB); - MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR)); + MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode())); return SplitBB; } @@ -1775,8 +2613,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock::iterator I(&MI); unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec); @@ -2121,19 +2959,66 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( if (MI.mayLoad()) Flags |= MachineMemOperand::MOLoad; - auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0); - MI.addMemOperand(*MF, MMO); + if (Flags != MachineMemOperand::MODereferenceable) { + auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0); + MI.addMemOperand(*MF, MMO); + } + return BB; } switch (MI.getOpcode()) { - case AMDGPU::SI_INIT_M0: + case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::S_SUB_U64_PSEUDO: { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + + MachineOperand &Dest = MI.getOperand(0); + MachineOperand &Src0 = MI.getOperand(1); + MachineOperand &Src1 = MI.getOperand(2); + + unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, + Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0, + &AMDGPU::SReg_32_XM0RegClass); + MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, + Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1, + &AMDGPU::SReg_32_XM0RegClass); + + MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, + Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0, + &AMDGPU::SReg_32_XM0RegClass); + MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, + Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1, + &AMDGPU::SReg_32_XM0RegClass); + + bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); + + unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; + unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; + BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) + .add(Src0Sub0) + .add(Src1Sub0); + BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) + .add(Src0Sub1) + .add(Src1Sub1); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::SI_INIT_M0: { BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) .add(MI.getOperand(0)); MI.eraseFromParent(); return BB; - + } case AMDGPU::SI_INIT_EXEC: // This should be before all vector instructions. 
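// A standalone sketch (not patch content) of the arithmetic performed by the
// S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO expansion above: split each operand
// into 32-bit halves, compute the low half with S_ADD_U32/S_SUB_U32 (which
// sets SCC on carry/borrow), then fold SCC into the high half with
// S_ADDC_U32/S_SUBB_U32. The helper name is illustrative.
#include <cstdint>

static uint64_t addSub64ViaHalves(uint64_t A, uint64_t B, bool IsAdd) {
  uint32_t ALo = static_cast<uint32_t>(A), AHi = static_cast<uint32_t>(A >> 32);
  uint32_t BLo = static_cast<uint32_t>(B), BHi = static_cast<uint32_t>(B >> 32);

  uint32_t Lo, Hi;
  if (IsAdd) {
    Lo = ALo + BLo;
    uint32_t Carry = Lo < ALo ? 1 : 0;   // models SCC after S_ADD_U32
    Hi = AHi + BHi + Carry;              // models S_ADDC_U32
  } else {
    Lo = ALo - BLo;
    uint32_t Borrow = ALo < BLo ? 1 : 0; // models SCC after S_SUB_U32
    Hi = AHi - BHi - Borrow;             // models S_SUBB_U32
  }
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}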
BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), @@ -2212,7 +3097,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::SI_INDIRECT_DST_V8: case AMDGPU::SI_INDIRECT_DST_V16: return emitIndirectDst(MI, *BB, *getSubtarget()); - case AMDGPU::SI_KILL: + case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: + case AMDGPU::SI_KILL_I1_PSEUDO: return splitKillBlock(MI, BB); case AMDGPU::V_CNDMASK_B64_PSEUDO: { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); @@ -2225,15 +3111,18 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy) + .addReg(SrcCond); BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo) .addReg(Src0, 0, AMDGPU::sub0) .addReg(Src1, 0, AMDGPU::sub0) - .addReg(SrcCond); + .addReg(SrcCondCopy); BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi) .addReg(Src0, 0, AMDGPU::sub1) .addReg(Src1, 0, AMDGPU::sub1) - .addReg(SrcCond); + .addReg(SrcCondCopy); BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst) .addReg(DstLo) @@ -2252,11 +3141,57 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MI.eraseFromParent(); return BB; } + case AMDGPU::ADJCALLSTACKUP: + case AMDGPU::ADJCALLSTACKDOWN: { + const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); + MachineInstrBuilder MIB(*MF, &MI); + MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine) + .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit); + return BB; + } + case AMDGPU::SI_CALL_ISEL: + case AMDGPU::SI_TCRETURN_ISEL: { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned GlobalAddrReg = MI.getOperand(0).getReg(); + MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg); + assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET); + + const GlobalValue *G = PCRel->getOperand(1).getGlobal(); + + MachineInstrBuilder MIB; + if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) { + MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg) + .add(MI.getOperand(0)) + .addGlobalAddress(G); + } else { + MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN)) + .add(MI.getOperand(0)) + .addGlobalAddress(G); + + // There is an additional imm operand for tcreturn, but it should be in the + // right place already. + } + + for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); + + MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + MI.eraseFromParent(); + return BB; + } default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } } +bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const { + return isTypeLegal(VT.getScalarType()); +} + bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const { // This currently forces unfolding various combinations of fsub into fma with // free fneg'd operands. 
As long as we have fast FMA (controlled by @@ -2356,7 +3291,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::FP_ROUND: return lowerFP_ROUND(Op, DAG); - case ISD::TRAP: case ISD::DEBUGTRAP: return lowerTRAP(Op, DAG); @@ -2660,11 +3594,11 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { case SISubtarget::TrapIDLLVMTrap: return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); case SISubtarget::TrapIDLLVMDebugTrap: { - DiagnosticInfoUnsupported NoTrap(*MF.getFunction(), + DiagnosticInfoUnsupported NoTrap(MF.getFunction(), "debugtrap handler not supported", Op.getDebugLoc(), DS_Warning); - LLVMContext &Ctx = MF.getFunction()->getContext(); + LLVMContext &Ctx = MF.getFunction().getContext(); Ctx.diagnose(NoTrap); return Chain; } @@ -2709,8 +3643,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, // private_segment_aperture_base_hi. uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44; - SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, QueuePtr, - DAG.getConstant(StructOffset, DL, MVT::i64)); + SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset); // TODO: Use custom target PseudoSourceValue. // TODO: We should use the value from the IR intrinsic call, but it might not @@ -2778,7 +3711,7 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, const MachineFunction &MF = DAG.getMachineFunction(); DiagnosticInfoUnsupported InvalidAddrSpaceCast( - *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); + MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); DAG.getContext()->diagnose(InvalidAddrSpaceCast); return DAG.getUNDEF(ASC->getValueType(0)); @@ -2917,13 +3850,16 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); + const GlobalValue *GV = GSD->getGlobal(); if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS && - GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS) + GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS && + // FIXME: It isn't correct to rely on the type of the pointer. This should + // be removed when address space 0 is 64-bit. 
+ !GV->getType()->getElementType()->isFunctionTy()) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); SDLoc DL(GSD); - const GlobalValue *GV = GSD->getGlobal(); EVT PtrVT = Op.getValueType(); if (shouldEmitFixup(GV)) @@ -2977,7 +3913,7 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT) { - DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), + DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(), "non-hsa intrinsic with hsa target", DL.getDebugLoc()); DAG.getContext()->diagnose(BadIntrin); @@ -2986,7 +3922,7 @@ static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT) { - DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), + DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(), "intrinsic not supported on subtarget", DL.getDebugLoc()); DAG.getContext()->diagnose(BadIntrin); @@ -2997,7 +3933,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); auto MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); EVT VT = Op.getValueType(); SDLoc DL(Op); @@ -3009,38 +3944,35 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_implicit_buffer_ptr: { if (getSubtarget()->isAmdCodeObjectV2(MF)) return emitNonHSAIntrinsicError(DAG, DL, VT); - - unsigned Reg = TRI->getPreloadedValue(MF, - SIRegisterInfo::IMPLICIT_BUFFER_PTR); - return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); } case Intrinsic::amdgcn_dispatch_ptr: case Intrinsic::amdgcn_queue_ptr: { if (!Subtarget->isAmdCodeObjectV2(MF)) { DiagnosticInfoUnsupported BadIntrin( - *MF.getFunction(), "unsupported hsa intrinsic without hsa target", + MF.getFunction(), "unsupported hsa intrinsic without hsa target", DL.getDebugLoc()); DAG.getContext()->diagnose(BadIntrin); return DAG.getUNDEF(VT); } - auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? - SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR; - return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, - TRI->getPreloadedValue(MF, Reg), VT); + auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? 
+ AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR; + return getPreloadedValue(DAG, *MFI, VT, RegID); } case Intrinsic::amdgcn_implicitarg_ptr: { - unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); - return lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), offset); + if (MFI->isEntryFunction()) + return getImplicitArgPtr(DAG, DL); + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); } case Intrinsic::amdgcn_kernarg_segment_ptr: { - unsigned Reg - = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); - return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); } case Intrinsic::amdgcn_dispatch_id: { - unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID); - return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID); } case Intrinsic::amdgcn_rcp: return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); @@ -3125,28 +4057,32 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::amdgcn_workgroup_id_x: case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::WORKGROUP_ID_X); case Intrinsic::amdgcn_workgroup_id_y: case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); case Intrinsic::amdgcn_workgroup_id_z: case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); - case Intrinsic::amdgcn_workitem_id_x: + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_workitem_id_x: { case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, + SDLoc(DAG.getEntryNode()), + MFI->getArgInfo().WorkItemIDX); + } case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, + SDLoc(DAG.getEntryNode()), + MFI->getArgInfo().WorkItemIDY); case Intrinsic::amdgcn_workitem_id_z: case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, + SDLoc(DAG.getEntryNode()), + MFI->getArgInfo().WorkItemIDZ); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops[] = { Op.getOperand(1), @@ -3193,7 +4129,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return SDValue(); DiagnosticInfoUnsupported BadIntrin( - *MF.getFunction(), "intrinsic not supported on subtarget", + MF.getFunction(), "intrinsic not supported on subtarget", DL.getDebugLoc()); DAG.getContext()->diagnose(BadIntrin); return DAG.getUNDEF(VT); @@ 
-3224,7 +4160,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // 3rd parameter required to be a constant. const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); if (!Param) - return DAG.getUNDEF(VT); + return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL); // Translate to the operands expected by the machine instruction. The // first parameter must be the same as the first instruction. @@ -3292,6 +4228,26 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2)); return DAG.getNode(ISD::BITCAST, DL, VT, Node); } + case Intrinsic::amdgcn_wqm: { + SDValue Src = Op.getOperand(1); + return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src), + 0); + } + case Intrinsic::amdgcn_wwm: { + SDValue Src = Op.getOperand(1); + return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src), + 0); + } + case Intrinsic::amdgcn_image_getlod: + case Intrinsic::amdgcn_image_getresinfo: { + unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4; + + // Replace dmask with everything disabled with undef. + const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx)); + if (!DMask || DMask->isNullValue()) + return DAG.getUNDEF(Op.getValueType()); + return SDValue(); + } default: return Op; } @@ -3365,6 +4321,95 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, Op->getVTList(), Ops, VT, MMO); } + case Intrinsic::amdgcn_buffer_atomic_swap: + case Intrinsic::amdgcn_buffer_atomic_add: + case Intrinsic::amdgcn_buffer_atomic_sub: + case Intrinsic::amdgcn_buffer_atomic_smin: + case Intrinsic::amdgcn_buffer_atomic_umin: + case Intrinsic::amdgcn_buffer_atomic_smax: + case Intrinsic::amdgcn_buffer_atomic_umax: + case Intrinsic::amdgcn_buffer_atomic_and: + case Intrinsic::amdgcn_buffer_atomic_or: + case Intrinsic::amdgcn_buffer_atomic_xor: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Op.getOperand(5), // offset + Op.getOperand(6) // slc + }; + EVT VT = Op.getOperand(3).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOVolatile, + VT.getStoreSize(), 4); + unsigned Opcode = 0; + + switch (IntrID) { + case Intrinsic::amdgcn_buffer_atomic_swap: + Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP; + break; + case Intrinsic::amdgcn_buffer_atomic_add: + Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD; + break; + case Intrinsic::amdgcn_buffer_atomic_sub: + Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB; + break; + case Intrinsic::amdgcn_buffer_atomic_smin: + Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN; + break; + case Intrinsic::amdgcn_buffer_atomic_umin: + Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN; + break; + case Intrinsic::amdgcn_buffer_atomic_smax: + Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX; + break; + case Intrinsic::amdgcn_buffer_atomic_umax: + Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX; + break; + case Intrinsic::amdgcn_buffer_atomic_and: + Opcode = AMDGPUISD::BUFFER_ATOMIC_AND; + break; + case Intrinsic::amdgcn_buffer_atomic_or: + Opcode = AMDGPUISD::BUFFER_ATOMIC_OR; + break; + case Intrinsic::amdgcn_buffer_atomic_xor: + Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; + break; + default: + llvm_unreachable("unhandled atomic opcode"); + } + + return 
DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO); + } + + case Intrinsic::amdgcn_buffer_atomic_cmpswap: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // src + Op.getOperand(3), // cmp + Op.getOperand(4), // rsrc + Op.getOperand(5), // vindex + Op.getOperand(6), // offset + Op.getOperand(7) // slc + }; + EVT VT = Op.getOperand(4).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOVolatile, + VT.getStoreSize(), 4); + + return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, + Op->getVTList(), Ops, VT, MMO); + } + // Basic sample. case Intrinsic::amdgcn_image_sample: case Intrinsic::amdgcn_image_sample_cl: @@ -3411,9 +4456,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_image_sample_c_b_cl_o: case Intrinsic::amdgcn_image_sample_c_lz_o: case Intrinsic::amdgcn_image_sample_c_cd_o: - case Intrinsic::amdgcn_image_sample_c_cd_cl_o: - - case Intrinsic::amdgcn_image_getlod: { + case Intrinsic::amdgcn_image_sample_c_cd_cl_o: { // Replace dmask with everything disabled with undef. const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5)); if (!DMask || DMask->isNullValue()) { @@ -3516,7 +4559,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second; + unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second; if (WGSize <= ST.getWavefrontSize()) return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other, Op.getOperand(0)), 0); @@ -3592,6 +4635,30 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op->getVTList(), Ops, VT, MMO); } + case Intrinsic::amdgcn_buffer_store: + case Intrinsic::amdgcn_buffer_store_format: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Op.getOperand(5), // offset + Op.getOperand(6), // glc + Op.getOperand(7) // slc + }; + EVT VT = Op.getOperand(3).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable, + VT.getStoreSize(), 4); + + unsigned Opcode = IntrinsicID == Intrinsic::amdgcn_buffer_store ? + AMDGPUISD::BUFFER_STORE : + AMDGPUISD::BUFFER_STORE_FORMAT; + return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO); + } + default: return Op; } @@ -3604,6 +4671,9 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { EVT MemVT = Load->getMemoryVT(); if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { + if (MemVT == MVT::i16 && isTypeLegal(MVT::i16)) + return SDValue(); + // FIXME: Copied from PPC // First, load into 32 bits, then truncate to 1 bit. @@ -4187,32 +5257,6 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, return SDValue(); } -/// \brief Return true if the given offset Size in bytes can be folded into -/// the immediate offsets of a memory instruction for the given address space. -static bool canFoldOffset(unsigned OffsetSize, unsigned AS, - const SISubtarget &STI) { - auto AMDGPUASI = STI.getAMDGPUAS(); - if (AS == AMDGPUASI.GLOBAL_ADDRESS) { - // MUBUF instructions a 12-bit offset in bytes. 
- return isUInt<12>(OffsetSize); - } - if (AS == AMDGPUASI.CONSTANT_ADDRESS) { - // SMRD instructions have an 8-bit offset in dwords on SI and - // a 20-bit offset in bytes on VI. - if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) - return isUInt<20>(OffsetSize); - else - return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); - } - if (AS == AMDGPUASI.LOCAL_ADDRESS || - AS == AMDGPUASI.REGION_ADDRESS) { - // The single offset versions have a 16-bit offset in bytes. - return isUInt<16>(OffsetSize); - } - // Indirect register addressing does not use any offsets. - return false; -} - // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) // This is a variant of @@ -4229,11 +5273,15 @@ static bool canFoldOffset(unsigned OffsetSize, unsigned AS, // SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace, + EVT MemVT, DAGCombinerInfo &DCI) const { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N0.getOpcode() != ISD::ADD) + // We only do this to handle cases where it's profitable when there are + // multiple uses of the add, so defer to the standard combine. + if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) || + N0->hasOneUse()) return SDValue(); const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1); @@ -4247,7 +5295,12 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, // If the resulting offset is too large, we can't fold it into the addressing // mode offset. APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); - if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget())) + Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext()); + + AddrMode AM; + AM.HasBaseReg = true; + AM.BaseOffs = Offset.getSExtValue(); + if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace)) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -4257,7 +5310,12 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); - return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); + SDNodeFlags Flags; + Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() && + (N0.getOpcode() == ISD::OR || + N0->getFlags().hasNoUnsignedWrap())); + + return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags); } SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N, @@ -4267,9 +5325,9 @@ SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N, SDLoc SL(N); // TODO: We could also do this for multiplies. 
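// A minimal sketch (not patch content) of the rewrite performSHLPtrCombine
// applies above: (x + c1) << c2 becomes (x << c2) + (c1 << c2), which lets
// the scaled constant be folded into a memory instruction's immediate offset.
// The patched code now asks isLegalAddressingMode whether the scaled offset
// fits; MaxImmOffset below is an assumed stand-in for that query.
#include <cstdint>
#include <optional>

struct SplitAddress {
  uint64_t ShiftedBase;  // x << c2
  uint64_t ScaledOffset; // c1 << c2, candidate immediate offset
};

static std::optional<SplitAddress>
distributeShlOverAdd(uint64_t X, uint64_t C1, unsigned C2, uint64_t MaxImmOffset) {
  uint64_t Scaled = C1 << C2;
  if (Scaled > MaxImmOffset)
    return std::nullopt; // offset too large for the addressing mode
  // (X + C1) << C2 == (X << C2) + (C1 << C2) under wrapping arithmetic.
  return SplitAddress{X << C2, Scaled};
}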
- unsigned AS = N->getAddressSpace(); - if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUASI.PRIVATE_ADDRESS) { - SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); + if (Ptr.getOpcode() == ISD::SHL) { + SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(), + N->getMemoryVT(), DCI); if (NewPtr) { SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end()); @@ -4818,15 +5876,27 @@ SDValue SITargetLowering::performIntMed3ImmCombine( return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3); } +static ConstantFPSDNode *getSplatConstantFP(SDValue Op) { + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) + return C; + + if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) { + if (ConstantFPSDNode *C = BV->getConstantFPSplatNode()) + return C; + } + + return nullptr; +} + SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, SDValue Op0, SDValue Op1) const { - ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1); + ConstantFPSDNode *K1 = getSplatConstantFP(Op1); if (!K1) return SDValue(); - ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1)); + ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1)); if (!K0) return SDValue(); @@ -4836,7 +5906,7 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, return SDValue(); // TODO: Check IEEE bit enabled? - EVT VT = K0->getValueType(0); + EVT VT = Op0.getValueType(); if (Subtarget->enableDX10Clamp()) { // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the // hardware fmed3 behavior converting to a min. @@ -4845,19 +5915,21 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0)); } - // med3 for f16 is only available on gfx9+. - if (VT == MVT::f64 || (VT == MVT::f16 && !Subtarget->hasMed3_16())) - return SDValue(); + // med3 for f16 is only available on gfx9+, and not available for v2f16. + if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) { + // This isn't safe with signaling NaNs because in IEEE mode, min/max on a + // signaling NaN gives a quiet NaN. The quiet NaN input to the min would + // then give the other result, which is different from med3 with a NaN + // input. + SDValue Var = Op0.getOperand(0); + if (!isKnownNeverSNan(DAG, Var)) + return SDValue(); - // This isn't safe with signaling NaNs because in IEEE mode, min/max on a - // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then - // give the other result, which is different from med3 with a NaN input. 
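// A standalone check (not patch content) of the identity the fmed3 combine
// above relies on when dx10_clamp is enabled: for ordered (non-NaN) inputs,
// med3(x, 0.0, 1.0) equals clamp(x) to [0, 1]. NaN inputs are exactly why the
// dx10_clamp and signaling-NaN guards exist in the real code. The helpers
// below are illustrative, not the hardware definitions.
#include <algorithm>
#include <cassert>

static float med3(float A, float B, float C) {
  // Median of three: drop the smallest and the largest value.
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}

static float clamp01(float X) { return std::min(std::max(X, 0.0f), 1.0f); }

int main() {
  for (float X : {-2.0f, 0.0f, 0.25f, 1.0f, 3.5f})
    assert(med3(X, 0.0f, 1.0f) == clamp01(X));
  return 0;
}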
- SDValue Var = Op0.getOperand(0); - if (!isKnownNeverSNan(DAG, Var)) - return SDValue(); + return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), + Var, SDValue(K0, 0), SDValue(K1, 0)); + } - return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), - Var, SDValue(K0, 0), SDValue(K1, 0)); + return SDValue(); } SDValue SITargetLowering::performMinMaxCombine(SDNode *N, @@ -4918,7 +5990,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, (Opc == AMDGPUISD::FMIN_LEGACY && Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && (VT == MVT::f32 || VT == MVT::f64 || - (VT == MVT::f16 && Subtarget->has16BitInsts())) && + (VT == MVT::f16 && Subtarget->has16BitInsts()) || + (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) && Op0.hasOneUse()) { if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) return Res; @@ -4994,7 +6067,7 @@ SDValue SITargetLowering::performExtractVectorEltCombine( SDNode *N, DAGCombinerInfo &DCI) const { SDValue Vec = N->getOperand(0); - SelectionDAG &DAG= DCI.DAG; + SelectionDAG &DAG = DCI.DAG; if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) { SDLoc SL(N); EVT EltVT = N->getValueType(0); @@ -5007,6 +6080,47 @@ SDValue SITargetLowering::performExtractVectorEltCombine( return SDValue(); } +static bool convertBuildVectorCastElt(SelectionDAG &DAG, + SDValue &Lo, SDValue &Hi) { + if (Hi.getOpcode() == ISD::BITCAST && + Hi.getOperand(0).getValueType() == MVT::f16 && + (isa<ConstantSDNode>(Lo) || Lo.isUndef())) { + Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo); + Hi = Hi.getOperand(0); + return true; + } + + return false; +} + +SDValue SITargetLowering::performBuildVectorCombine( + SDNode *N, DAGCombinerInfo &DCI) const { + SDLoc SL(N); + + if (!isTypeLegal(MVT::v2i16)) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (VT == MVT::v2i16) { + SDValue Lo = N->getOperand(0); + SDValue Hi = N->getOperand(1); + + // v2i16 build_vector (const|undef), (bitcast f16:$x) + // -> bitcast (v2f16 build_vector const|undef, $x + if (convertBuildVectorCastElt(DAG, Lo, Hi)) { + SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi }); + return DAG.getNode(ISD::BITCAST, SL, VT, NewVec); + } + + if (convertBuildVectorCastElt(DAG, Hi, Lo)) { + SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo }); + return DAG.getNode(ISD::BITCAST, SL, VT, NewVec); + } + } + + return SDValue(); +} unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, @@ -5030,18 +6144,57 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, return 0; } +static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, + EVT VT, + SDValue N0, SDValue N1, SDValue N2, + bool Signed) { + unsigned MadOpc = Signed ? 
AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32; + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1); + SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2); + return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad); +} + SDValue SITargetLowering::performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); - - if (VT != MVT::i32) - return SDValue(); - SDLoc SL(N); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); + if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) + && Subtarget->hasMad64_32() && + !VT.isVector() && VT.getScalarSizeInBits() > 32 && + VT.getScalarSizeInBits() <= 64) { + if (LHS.getOpcode() != ISD::MUL) + std::swap(LHS, RHS); + + SDValue MulLHS = LHS.getOperand(0); + SDValue MulRHS = LHS.getOperand(1); + SDValue AddRHS = RHS; + + // TODO: Maybe restrict if SGPR inputs. + if (numBitsUnsigned(MulLHS, DAG) <= 32 && + numBitsUnsigned(MulRHS, DAG) <= 32) { + MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32); + MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32); + AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64); + return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false); + } + + if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) { + MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32); + MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32); + AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64); + return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true); + } + + return SDValue(); + } + + if (VT != MVT::i32) + return SDValue(); + // add x, zext (setcc) => addcarry x, 0, setcc // add x, sext (setcc) => subcarry x, 0, setcc unsigned Opc = LHS.getOpcode(); @@ -5428,6 +6581,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, } case ISD::EXTRACT_VECTOR_ELT: return performExtractVectorEltCombine(N, DCI); + case ISD::BUILD_VECTOR: + return performBuildVectorCombine(N, DCI); } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } @@ -5444,13 +6599,19 @@ static unsigned SubIdx2Lane(unsigned Idx) { } /// \brief Adjust the writemask of MIMG instructions -void SITargetLowering::adjustWritemask(MachineSDNode *&Node, - SelectionDAG &DAG) const { - SDNode *Users[4] = { }; +SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, + SelectionDAG &DAG) const { + SDNode *Users[4] = { nullptr }; unsigned Lane = 0; unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3; unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); unsigned NewDmask = 0; + bool HasChain = Node->getNumValues() > 1; + + if (OldDmask == 0) { + // These are folded out, but on the chance it happens don't assert. + return Node; + } // Try to figure out the used register components for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); @@ -5463,9 +6624,9 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, // Abort if we can't understand the usage if (!I->isMachineOpcode() || I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) - return; + return Node; - // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. + // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used. // Note that subregs are packed, i.e. Lane==0 is the first bit set // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit // set, etc. @@ -5474,14 +6635,13 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, // Set which texture component corresponds to the lane. 
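// A sketch (not patch content) of the reasoning behind the MAD_U64_U32 /
// MAD_I64_I32 combine in performAddCombine above: when known-bits analysis
// shows both multiplicands of a 64-bit a*b + c fit in 32 bits, the whole
// expression can be done with one 32x32->64 multiply-add. This is the
// unsigned case in plain C++; the real code queries computeKnownBits rather
// than taking uint32_t parameters, and the signed variant sign-extends.
#include <cstdint>

static uint64_t madU64U32(uint32_t A, uint32_t B, uint64_t C) {
  // 32x32 -> 64 multiply plus 64-bit accumulate, matching what the
  // mad_u64_u32 pattern provides on subtargets with hasMad64_32().
  return static_cast<uint64_t>(A) * static_cast<uint64_t>(B) + C;
}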
unsigned Comp; for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { - assert(Dmask); Comp = countTrailingZeros(Dmask); Dmask &= ~(1 << Comp); } // Abort if we have more than one user per component if (Users[Lane]) - return; + return Node; Users[Lane] = *I; NewDmask |= 1 << Comp; @@ -5489,25 +6649,47 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, // Abort if there's no change if (NewDmask == OldDmask) - return; + return Node; + + unsigned BitsSet = countPopulation(NewDmask); + + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + int NewOpcode = AMDGPU::getMaskedMIMGOp(*TII, + Node->getMachineOpcode(), BitsSet); + assert(NewOpcode != -1 && + NewOpcode != static_cast<int>(Node->getMachineOpcode()) && + "failed to find equivalent MIMG op"); // Adjust the writemask in the node - std::vector<SDValue> Ops; + SmallVector<SDValue, 12> Ops; Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx); Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end()); - Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); - - // If we only got one lane, replace it with a copy - // (if NewDmask has only one bit set...) - if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { - SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), - MVT::i32); - SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, - SDLoc(), Users[Lane]->getValueType(0), - SDValue(Node, 0), RC); + + MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT(); + + MVT ResultVT = BitsSet == 1 ? + SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet); + SDVTList NewVTList = HasChain ? + DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT); + + + MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node), + NewVTList, Ops); + + if (HasChain) { + // Update chain. + NewNode->setMemRefs(Node->memoperands_begin(), Node->memoperands_end()); + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1)); + } + + if (BitsSet == 1) { + assert(Node->hasNUsesOfValue(1, 0)); + SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY, + SDLoc(Node), Users[Lane]->getValueType(0), + SDValue(NewNode, 0)); DAG.ReplaceAllUsesWith(Users[Lane], Copy); - return; + return nullptr; } // Update the users of the node with the new indices @@ -5517,7 +6699,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, continue; SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); - DAG.UpdateNodeOperands(User, User->getOperand(0), Op); + DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op); switch (Idx) { default: break; @@ -5526,6 +6708,9 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, case AMDGPU::sub2: Idx = AMDGPU::sub3; break; } } + + DAG.RemoveDeadNode(Node); + return nullptr; } static bool isFrameIndexOp(SDValue Op) { @@ -5579,25 +6764,80 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, Node->getOperand(i)), 0)); } - DAG.UpdateNodeOperands(Node, Ops); - return Node; + return DAG.UpdateNodeOperands(Node, Ops); } /// \brief Fold the instructions after selecting them. +/// Returns null if users were already updated. 
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); unsigned Opcode = Node->getMachineOpcode(); if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && - !TII->isGather4(Opcode)) - adjustWritemask(Node, DAG); + !TII->isGather4(Opcode)) { + return adjustWritemask(Node, DAG); + } if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) { legalizeTargetIndependentNode(Node, DAG); return Node; } + + switch (Opcode) { + case AMDGPU::V_DIV_SCALE_F32: + case AMDGPU::V_DIV_SCALE_F64: { + // Satisfy the operand register constraint when one of the inputs is + // undefined. Ordinarily each undef value will have its own implicit_def of + // a vreg, so force these to use a single register. + SDValue Src0 = Node->getOperand(0); + SDValue Src1 = Node->getOperand(1); + SDValue Src2 = Node->getOperand(2); + + if ((Src0.isMachineOpcode() && + Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) && + (Src0 == Src1 || Src0 == Src2)) + break; + + MVT VT = Src0.getValueType().getSimpleVT(); + const TargetRegisterClass *RC = getRegClassFor(VT); + + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT); + + SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), + UndefReg, Src0, SDValue()); + + // src0 must be the same register as src1 or src2, even if the value is + // undefined, so make sure we don't violate this constraint. + if (Src0.isMachineOpcode() && + Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) { + if (Src1.isMachineOpcode() && + Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) + Src0 = Src1; + else if (Src2.isMachineOpcode() && + Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) + Src0 = Src2; + else { + assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF); + Src0 = UndefReg; + Src1 = UndefReg; + } + } else + break; + + SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 }; + for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I) + Ops.push_back(Node->getOperand(I)); + + Ops.push_back(ImpDef.getValue(1)); + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } + default: + break; + } + return Node; } @@ -5615,31 +6855,6 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, return; } - if (TII->isMIMG(MI)) { - unsigned VReg = MI.getOperand(0).getReg(); - const TargetRegisterClass *RC = MRI.getRegClass(VReg); - // TODO: Need mapping tables to handle other cases (register classes). - if (RC != &AMDGPU::VReg_128RegClass) - return; - - unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4; - unsigned Writemask = MI.getOperand(DmaskIdx).getImm(); - unsigned BitsSet = 0; - for (unsigned i = 0; i < 4; ++i) - BitsSet += Writemask & (1 << i) ? 1 : 0; - switch (BitsSet) { - default: return; - case 1: RC = &AMDGPU::VGPR_32RegClass; break; - case 2: RC = &AMDGPU::VReg_64RegClass; break; - case 3: RC = &AMDGPU::VReg_96RegClass; break; - } - - unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet); - MI.setDesc(TII->get(NewOpcode)); - MRI.setRegClass(VReg, RC); - return; - } - // Replace unused atomics with the no return version. 
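Before the hunk moves on to the atomic no-return replacement, here is a condensed model of the MIMG shrinking logic deleted just above: the old post-instruction-selection code derived the destination register class from how many dmask bits survive, and that job now happens during selection in adjustWritemask via getMaskedMIMGOp. The class names below are only labels for the sketch, not real register-class objects:

```cpp
#include <bitset>
#include <string>

// Map the number of enabled dmask bits of a 4-component MIMG result to the
// destination register width, as the removed post-isel code did.
std::string mimgResultClass(unsigned dmask) {
  switch (std::bitset<4>(dmask & 0xF).count()) {
  case 1:  return "VGPR_32";   // single component
  case 2:  return "VReg_64";   // two components
  case 3:  return "VReg_96";   // three components
  default: return "VReg_128";  // all four: keep the original 128-bit class
  }
}
```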
int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode()); if (NoRetAtomicOp != -1) { @@ -5870,3 +7085,21 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { TargetLoweringBase::finalizeLowering(MF); } + +void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, + KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth) const { + TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts, + DAG, Depth); + + if (getSubtarget()->enableHugePrivateBuffer()) + return; + + // Technically it may be possible to have a dispatch with a single workitem + // that uses the full private memory size, but that's not really useful. We + // can't use vaddr in MUBUF instructions if we don't know the address + // calculation won't overflow, so assume the sign bit is never set. + Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits); +} diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index e6bb3d6cd419..b48e67f7563a 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -16,6 +16,7 @@ #define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H #include "AMDGPUISelLowering.h" +#include "AMDGPUArgumentUsageInfo.h" #include "SIInstrInfo.h" namespace llvm { @@ -23,6 +24,7 @@ namespace llvm { class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, uint64_t Offset) const; + SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const; SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, uint64_t Offset, bool Signed, @@ -31,6 +33,10 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA, const SDLoc &SL, SDValue Chain, const ISD::InputArg &Arg) const; + SDValue getPreloadedValue(SelectionDAG &DAG, + const SIMachineFunctionInfo &MFI, + EVT VT, + AMDGPUFunctionArgInfo::PreloadedValue) const; SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; @@ -76,12 +82,13 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; - void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; + SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; SDValue performUCharToFloatCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSHLPtrCombine(SDNode *N, unsigned AS, + EVT MemVT, DAGCombinerInfo &DCI) const; SDValue performMemSDNodeCombine(MemSDNode *N, DAGCombinerInfo &DCI) const; @@ -105,6 +112,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performBuildVectorCombine(SDNode *N, DAGCombinerInfo &DCI) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; @@ -117,6 +125,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; + bool isLegalGlobalAddressingMode(const AddrMode &AM) const; bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; 
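Stepping back to the computeKnownBitsForFrameIndex override added at the end of the SIISelLowering.cpp hunk above (and declared later in this header): it encodes the assumption that, unless huge private buffers are enabled, a frame-index address never uses the high bits, so address arithmetic based on it cannot set the sign bit. A rough standalone sketch of that idea, assuming a hypothetical bit budget `PrivateAddrBits` (the patch uses a constant named AssumeFrameIndexHighZeroBits whose value is not shown in this hunk):

```cpp
#include <cstdint>

// Hypothetical: number of low bits a private (scratch) address may occupy.
constexpr unsigned PrivateAddrBits = 16;

// Mask of the high bits that can be assumed zero for a frame-index value in
// a 32-bit private address space.
constexpr uint32_t knownZeroHighBits() {
  return ~((uint32_t(1) << PrivateAddrBits) - 1);
}

static_assert((knownZeroHighBits() & 0x80000000u) != 0,
              "the sign bit is always among the known-zero bits");
```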
unsigned isCFIntrinsic(const SDNode *Intr) const; @@ -140,10 +149,10 @@ public: const SISubtarget *getSubtarget() const; - bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/, - EVT /*VT*/) const override; + bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override; bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, + MachineFunction &MF, unsigned IntrinsicID) const override; bool getAddrModeArguments(IntrinsicInst * /*I*/, @@ -151,7 +160,8 @@ public: Type *&/*AccessTy*/) const override; bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, - unsigned AS) const override; + unsigned AS, + Instruction *I = nullptr) const override; bool canMergeStoresTo(unsigned AS, EVT MemVT, const SelectionDAG &DAG) const override; @@ -181,6 +191,12 @@ public: bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + bool supportSplitCSR(MachineFunction *MF) const override; + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, @@ -197,6 +213,32 @@ public: const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; + void passSpecialInputs( + CallLoweringInfo &CLI, + const SIMachineFunctionInfo &Info, + SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, + SmallVectorImpl<SDValue> &MemOpChains, + SDValue Chain, + SDValue StackPtr) const; + + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + const SDLoc &DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals, bool isThisReturn, + SDValue ThisVal) const; + + bool mayBeEmittedAsTailCall(const CallInst *) const override; + + bool isEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const; + + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; + unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; @@ -206,6 +248,8 @@ public: MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; + + bool hasBitPreservingFPLogic(EVT VT) const override; bool enableAggressiveFMAFusion(EVT VT) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -234,6 +278,12 @@ public: SDValue V) const; void finalizeLowering(MachineFunction &MF) const override; + + void computeKnownBitsForFrameIndex(const SDValue Op, + KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp index ba346d2fad02..a2f844d7854e 100644 --- a/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -132,6 +132,16 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) return true; + // V_READFIRSTLANE/V_READLANE destination register may be used as operand + // by some SALU instruction. 
If exec mask is zero vector instruction + // defining the register that is used by the scalar one is not executed + // and scalar instruction will operate on undefined data. For + // V_READFIRSTLANE/V_READLANE we should avoid predicated execution. + if ((I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) || + (I->getOpcode() == AMDGPU::V_READLANE_B32)) { + return true; + } + if (I->isInlineAsm()) { const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); const char *AsmStr = I->getOperand(0).getSymbolName(); @@ -156,7 +166,7 @@ bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) { MachineBasicBlock &MBB = *MI.getParent(); MachineFunction *MF = MBB.getParent(); - if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS || + if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS || !shouldSkip(MBB, MBB.getParent()->back())) return false; @@ -190,25 +200,101 @@ bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) { void SIInsertSkips::kill(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); - const MachineOperand &Op = MI.getOperand(0); - -#ifndef NDEBUG - CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv(); - // Kill is only allowed in pixel / geometry shaders. - assert(CallConv == CallingConv::AMDGPU_PS || - CallConv == CallingConv::AMDGPU_GS); -#endif - // Clear this thread from the exec mask if the operand is negative. - if (Op.isImm()) { - // Constant operand: Set exec mask to 0 or do nothing - if (Op.getImm() & 0x80000000) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addImm(0); + + switch (MI.getOpcode()) { + case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: { + unsigned Opcode = 0; + + // The opcodes are inverted because the inline immediate has to be + // the first operand, e.g. 
from "x < imm" to "imm > x" + switch (MI.getOperand(2).getImm()) { + case ISD::SETOEQ: + case ISD::SETEQ: + Opcode = AMDGPU::V_CMPX_EQ_F32_e32; + break; + case ISD::SETOGT: + case ISD::SETGT: + Opcode = AMDGPU::V_CMPX_LT_F32_e32; + break; + case ISD::SETOGE: + case ISD::SETGE: + Opcode = AMDGPU::V_CMPX_LE_F32_e32; + break; + case ISD::SETOLT: + case ISD::SETLT: + Opcode = AMDGPU::V_CMPX_GT_F32_e32; + break; + case ISD::SETOLE: + case ISD::SETLE: + Opcode = AMDGPU::V_CMPX_GE_F32_e32; + break; + case ISD::SETONE: + case ISD::SETNE: + Opcode = AMDGPU::V_CMPX_LG_F32_e32; + break; + case ISD::SETO: + Opcode = AMDGPU::V_CMPX_O_F32_e32; + break; + case ISD::SETUO: + Opcode = AMDGPU::V_CMPX_U_F32_e32; + break; + case ISD::SETUEQ: + Opcode = AMDGPU::V_CMPX_NLG_F32_e32; + break; + case ISD::SETUGT: + Opcode = AMDGPU::V_CMPX_NGE_F32_e32; + break; + case ISD::SETUGE: + Opcode = AMDGPU::V_CMPX_NGT_F32_e32; + break; + case ISD::SETULT: + Opcode = AMDGPU::V_CMPX_NLE_F32_e32; + break; + case ISD::SETULE: + Opcode = AMDGPU::V_CMPX_NLT_F32_e32; + break; + case ISD::SETUNE: + Opcode = AMDGPU::V_CMPX_NEQ_F32_e32; + break; + default: + llvm_unreachable("invalid ISD:SET cond code"); } - } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32)) - .addImm(0) + + // TODO: Allow this: + if (!MI.getOperand(0).isReg() || + !TRI->isVGPR(MBB.getParent()->getRegInfo(), + MI.getOperand(0).getReg())) + llvm_unreachable("SI_KILL operand should be a VGPR"); + + BuildMI(MBB, &MI, DL, TII->get(Opcode)) + .add(MI.getOperand(1)) + .add(MI.getOperand(0)); + break; + } + case AMDGPU::SI_KILL_I1_TERMINATOR: { + const MachineOperand &Op = MI.getOperand(0); + int64_t KillVal = MI.getOperand(1).getImm(); + assert(KillVal == 0 || KillVal == -1); + + // Kill all threads if Op0 is an immediate and equal to the Kill value. + if (Op.isImm()) { + int64_t Imm = Op.getImm(); + assert(Imm == 0 || Imm == -1); + + if (Imm == KillVal) + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addImm(0); + break; + } + + unsigned Opcode = KillVal ? 
AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64; + BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) .add(Op); + break; + } + default: + llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR"); } } @@ -301,7 +387,8 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { } break; - case AMDGPU::SI_KILL_TERMINATOR: + case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: + case AMDGPU::SI_KILL_I1_TERMINATOR: MadeChange = true; kill(MI); diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 0f009a48754a..6bbe5979316d 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1,4 +1,4 @@ -//===-- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===/ +//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===// // // The LLVM Compiler Infrastructure // @@ -21,12 +21,34 @@ #include "SIDefines.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> +#include <memory> +#include <utility> +#include <vector> #define DEBUG_TYPE "si-insert-waitcnts" @@ -42,7 +64,7 @@ namespace { enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS }; -typedef std::pair<signed, signed> RegInterval; +using RegInterval = std::pair<signed, signed>; struct { int32_t VmcntMax; @@ -101,6 +123,15 @@ enum RegisterMapping { // "s_waitcnt 0" before use. class BlockWaitcntBrackets { public: + BlockWaitcntBrackets() { + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) { + memset(VgprScores[T], 0, sizeof(VgprScores[T])); + } + } + + ~BlockWaitcntBrackets() = default; + static int32_t getWaitCountMax(InstCounterType T) { switch (T) { case VM_CNT: @@ -113,14 +144,14 @@ public: break; } return 0; - }; + } void setScoreLB(InstCounterType T, int32_t Val) { assert(T < NUM_INST_CNTS); if (T >= NUM_INST_CNTS) return; ScoreLBs[T] = Val; - }; + } void setScoreUB(InstCounterType T, int32_t Val) { assert(T < NUM_INST_CNTS); @@ -132,21 +163,21 @@ public: if (ScoreLBs[T] < UB) ScoreLBs[T] = UB; } - }; + } int32_t getScoreLB(InstCounterType T) { assert(T < NUM_INST_CNTS); if (T >= NUM_INST_CNTS) return 0; return ScoreLBs[T]; - }; + } int32_t getScoreUB(InstCounterType T) { assert(T < NUM_INST_CNTS); if (T >= NUM_INST_CNTS) return 0; return ScoreUBs[T]; - }; + } // Mapping from event to counter. 
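Before the class goes on to map events to counters, note how the setScoreUB method shown above keeps each counter's score bracket no wider than the hardware counter can actually count: once the span would exceed getWaitCountMax, the lower bound is dragged forward, presumably because a saturating counter can no longer distinguish older events from completed ones. A minimal standalone model of that clamping, with plain integers in place of the per-counter arrays:

```cpp
#include <cstdint>

// One in-flight-operation bracket for a single hardware counter.
struct ScoreBracket {
  int32_t LB = 0;       // oldest score we might still wait on
  int32_t UB = 0;       // score of the most recent event
  int32_t HardwareMax;  // how far the real counter can count

  void pushEvent(int32_t Score) {
    UB = Score;
    // Anything older than UB - HardwareMax is indistinguishable from
    // "already complete", so pull the lower bound up to it.
    if (LB < UB - HardwareMax)
      LB = UB - HardwareMax;
  }
};
```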
InstCounterType eventCounter(WaitEventType E) { @@ -218,26 +249,18 @@ public: void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; } int32_t getMaxVGPR() const { return VgprUB; } int32_t getMaxSGPR() const { return SgprUB; } + int32_t getEventUB(enum WaitEventType W) const { assert(W < NUM_WAIT_EVENTS); return EventUBs[W]; } + bool counterOutOfOrder(InstCounterType T); unsigned int updateByWait(InstCounterType T, int ScoreToWait); void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &MI); - BlockWaitcntBrackets() - : WaitAtBeginning(false), RevisitLoop(false), ValidLoop(false), MixedExpTypes(false), - LoopRegion(NULL), PostOrder(0), Waitcnt(NULL), VgprUB(0), SgprUB(0) { - for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; - T = (enum InstCounterType)(T + 1)) { - memset(VgprScores[T], 0, sizeof(VgprScores[T])); - } - } - ~BlockWaitcntBrackets(){}; - bool hasPendingSMEM() const { return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] && EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]); @@ -266,7 +289,7 @@ public: int32_t getPostOrder() const { return PostOrder; } void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; } - void clearWaitcnt() { Waitcnt = NULL; } + void clearWaitcnt() { Waitcnt = nullptr; } MachineInstr *getWaitcnt() const { return Waitcnt; } bool mixedExpTypes() const { return MixedExpTypes; } @@ -278,13 +301,11 @@ public: void dump() { print(dbgs()); } private: - bool WaitAtBeginning; - bool RevisitLoop; - bool ValidLoop; - bool MixedExpTypes; - MachineLoop *LoopRegion; - int32_t PostOrder; - MachineInstr *Waitcnt; + bool WaitAtBeginning = false; + bool RevisitLoop = false; + bool MixedExpTypes = false; + int32_t PostOrder = 0; + MachineInstr *Waitcnt = nullptr; int32_t ScoreLBs[NUM_INST_CNTS] = {0}; int32_t ScoreUBs[NUM_INST_CNTS] = {0}; int32_t EventUBs[NUM_WAIT_EVENTS] = {0}; @@ -292,8 +313,8 @@ private: int32_t LastFlat[NUM_INST_CNTS] = {0}; // wait_cnt scores for every vgpr. // Keep track of the VgprUB and SgprUB to make merge at join efficient. - int32_t VgprUB; - int32_t SgprUB; + int32_t VgprUB = 0; + int32_t SgprUB = 0; int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS]; // Wait cnt scores for every sgpr, only lgkmcnt is relevant. int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0}; @@ -306,38 +327,36 @@ private: // at the end of the loop footer. class LoopWaitcntData { public: + LoopWaitcntData() = default; + ~LoopWaitcntData() = default; + void incIterCnt() { IterCnt++; } void resetIterCnt() { IterCnt = 0; } int32_t getIterCnt() { return IterCnt; } - LoopWaitcntData() : LfWaitcnt(NULL), IterCnt(0) {} - ~LoopWaitcntData(){}; - void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; } MachineInstr *getWaitcnt() const { return LfWaitcnt; } void print() { DEBUG(dbgs() << " iteration " << IterCnt << '\n';); - return; } private: // s_waitcnt added at the end of loop footer to stablize wait scores // at the end of the loop footer. - MachineInstr *LfWaitcnt; + MachineInstr *LfWaitcnt = nullptr; // Number of iterations the loop has been visited, not including the initial // walk over. 
- int32_t IterCnt; + int32_t IterCnt = 0; }; class SIInsertWaitcnts : public MachineFunctionPass { - private: - const SISubtarget *ST; - const SIInstrInfo *TII; - const SIRegisterInfo *TRI; - const MachineRegisterInfo *MRI; - const MachineLoopInfo *MLI; + const SISubtarget *ST = nullptr; + const SIInstrInfo *TII = nullptr; + const SIRegisterInfo *TRI = nullptr; + const MachineRegisterInfo *MRI = nullptr; + const MachineLoopInfo *MLI = nullptr; AMDGPU::IsaInfo::IsaVersion IV; AMDGPUAS AMDGPUASI; @@ -357,9 +376,7 @@ private: public: static char ID; - SIInsertWaitcnts() - : MachineFunctionPass(ID), ST(nullptr), TII(nullptr), TRI(nullptr), - MRI(nullptr), MLI(nullptr) {} + SIInsertWaitcnts() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -376,9 +393,11 @@ public: void addKillWaitBracket(BlockWaitcntBrackets *Bracket) { // The waitcnt information is copied because it changes as the block is // traversed. - KillWaitBrackets.push_back(make_unique<BlockWaitcntBrackets>(*Bracket)); + KillWaitBrackets.push_back( + llvm::make_unique<BlockWaitcntBrackets>(*Bracket)); } + bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets); void updateEventWaitCntAfter(MachineInstr &Inst, @@ -389,7 +408,7 @@ public: void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst); }; -} // End anonymous namespace. +} // end anonymous namespace RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII, @@ -567,13 +586,13 @@ void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } #if 0 // TODO: check if this is handled by MUBUF code above. } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD || - Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 || - Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) { + Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 || + Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) { MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data); unsigned OpNo;//TODO: find the OpNo for this operand; RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false); for (signed RegNo = Interval.first; RegNo < Interval.second; - ++RegNo) { + ++RegNo) { setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore); } #endif @@ -642,7 +661,6 @@ void BlockWaitcntBrackets::print(raw_ostream &OS) { OS << '\n'; } OS << '\n'; - return; } unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T, @@ -860,7 +878,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( switch (src_type) { case SCMEM_LDS: if (group_is_multi_wave || - context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) { + context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) { EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT)); // LDS may have to wait for VM_CNT after buffer load to LDS @@ -874,9 +892,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( case SCMEM_GDS: if (group_is_multi_wave || fence_is_global) { EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, - ScoreBrackets->getScoreUB(EXP_CNT)); + ScoreBrackets->getScoreUB(EXP_CNT)); EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, - ScoreBrackets->getScoreUB(LGKM_CNT)); + ScoreBrackets->getScoreUB(LGKM_CNT)); } break; @@ -886,9 +904,9 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( case SCMEM_SCATTER: if (group_is_multi_wave || fence_is_global) { EmitSwaitcnt |= 
ScoreBrackets->updateByWait(EXP_CNT, - ScoreBrackets->getScoreUB(EXP_CNT)); + ScoreBrackets->getScoreUB(EXP_CNT)); EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, - ScoreBrackets->getScoreUB(VM_CNT)); + ScoreBrackets->getScoreUB(VM_CNT)); } break; @@ -927,13 +945,14 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( // before the call. if (MI.getOpcode() == SC_CALL) { if (ScoreBrackets->getScoreUB(EXP_CNT) > - ScoreBrackets->getScoreLB(EXP_CNT)) { + ScoreBrackets->getScoreLB(EXP_CNT)) { ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); EmitSwaitcnt |= CNT_MASK(EXP_CNT); } } #endif + // FIXME: Should not be relying on memoperands. // Look at the source operands of every instruction to see if // any of them results from a previous memory operation that affects // its current usage. If so, an s_waitcnt instruction needs to be @@ -949,6 +968,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( EmitSwaitcnt |= ScoreBrackets->updateByWait( VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); } + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { const MachineOperand &Op = MI.getOperand(I); const MachineRegisterInfo &MRIA = *MRI; @@ -973,6 +993,7 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( // 2) If a destination operand that was used by a recent export/store ins, // add s_waitcnt on exp_cnt to guarantee the WAR order. if (MI.mayStore()) { + // FIXME: Should not be relying on memoperands. for (const MachineMemOperand *Memop : MI.memoperands()) { unsigned AS = Memop->getAddrSpace(); if (AS != AMDGPUASI.LOCAL_ADDRESS) @@ -1094,7 +1115,8 @@ MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore( BlockWaitcntBracketsMap[TBB].get(); if (!ScoreBracket) { assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end()); - BlockWaitcntBracketsMap[TBB] = make_unique<BlockWaitcntBrackets>(); + BlockWaitcntBracketsMap[TBB] = + llvm::make_unique<BlockWaitcntBrackets>(); ScoreBracket = BlockWaitcntBracketsMap[TBB].get(); } ScoreBracket->setRevisitLoop(true); @@ -1141,8 +1163,21 @@ void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB, } else { MBB.push_back(Waitcnt); } +} + +// This is a flat memory operation. Check to see if it has memory +// tokens for both LDS and Memory, and if so mark it as a flat. +bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { + if (MI.memoperands_empty()) + return true; - return; + for (const MachineMemOperand *Memop : MI.memoperands()) { + unsigned AS = Memop->getAddrSpace(); + if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) + return true; + } + + return false; } void SIInsertWaitcnts::updateEventWaitCntAfter( @@ -1151,10 +1186,8 @@ void SIInsertWaitcnts::updateEventWaitCntAfter( // instruction, update the upper-bound of the appropriate counter's // bracket and the destination operand scores. // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. 
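The new mayAccessLDSThroughFlat helper above is deliberately conservative: with no memory operands at all, a flat access is assumed to possibly touch LDS, and any operand in the LDS or generic flat address space also counts. A standalone sketch of that check, with address spaces reduced to a small enum (names are illustrative):

```cpp
#include <vector>

enum class AddrSpace { Global, Local /*LDS*/, Flat, Private, Constant };

// Conservatively decide whether a FLAT instruction might hit LDS, given the
// address spaces recorded on its memory operands (possibly none).
bool mayAccessLDSThroughFlat(const std::vector<AddrSpace> &MemOperandAS) {
  if (MemOperandAS.empty())
    return true;  // no information: assume the worst
  for (AddrSpace AS : MemOperandAS)
    if (AS == AddrSpace::Local || AS == AddrSpace::Flat)
      return true;
  return false;
}
```

As the hunk that follows shows, a flat operation that might touch LDS is marked pending on both the VM and LGKM brackets, since either dependency would then have to flush it.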
- uint64_t TSFlags = Inst.getDesc().TSFlags; - if (TII->isDS(Inst) && (TSFlags & SIInstrFlags::LGKM_CNT)) { - if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds) && - TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) { + if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { + if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) { ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); } else { @@ -1162,23 +1195,18 @@ void SIInsertWaitcnts::updateEventWaitCntAfter( } } else if (TII->isFLAT(Inst)) { assert(Inst.mayLoad() || Inst.mayStore()); - ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); - ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); - // This is a flat memory operation. Check to see if it has memory - // tokens for both LDS and Memory, and if so mark it as a flat. - bool FoundLDSMem = false; - for (const MachineMemOperand *Memop : Inst.memoperands()) { - unsigned AS = Memop->getAddrSpace(); - if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) - FoundLDSMem = true; - } + if (TII->usesVM_CNT(Inst)) + ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst); + + if (TII->usesLGKM_CNT(Inst)) { + ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); - // This is a flat memory operation, so note it - it will require - // that both the VM and LGKM be flushed to zero if it is pending when - // a VM or LGKM dependency occurs. - if (FoundLDSMem) { - ScoreBrackets->setPendingFlat(); + // This is a flat memory operation, so note it - it will require + // that both the VM and LGKM be flushed to zero if it is pending when + // a VM or LGKM dependency occurs. + if (mayAccessLDSThroughFlat(Inst)) + ScoreBrackets->setPendingFlat(); } } else if (SIInstrInfo::isVMEM(Inst) && // TODO: get a better carve out. @@ -1241,7 +1269,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { BlockWaitcntBracketsMap[pred].get(); bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end(); if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { - break; + continue; } for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { @@ -1280,7 +1308,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { BlockWaitcntBracketsMap[Pred].get(); bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end(); if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { - break; + continue; } int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) - @@ -1327,7 +1355,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { // Set the register scoreboard. for (MachineBasicBlock *Pred : Block.predecessors()) { if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) { - break; + continue; } BlockWaitcntBrackets *PredScoreBrackets = @@ -1441,7 +1469,7 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { // the delayed nature of these operations. for (MachineBasicBlock *Pred : Block.predecessors()) { if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) { - break; + continue; } BlockWaitcntBrackets *PredScoreBrackets = @@ -1494,8 +1522,6 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, ScoreBrackets->dump(); }); - bool InsertNOP = false; - // Walk over the instructions. 
for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end(); Iter != E;) { @@ -1555,7 +1581,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, if (RequireCheckResourceType(Inst, context)) { // Force the score to as if an S_WAITCNT vmcnt(0) is emitted. ScoreBrackets->setScoreLB(VM_CNT, - ScoreBrackets->getScoreUB(VM_CNT)); + ScoreBrackets->getScoreUB(VM_CNT)); } #endif @@ -1596,58 +1622,6 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, VCCZBugHandledSet.insert(&Inst); } - if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { - - // This avoids a s_nop after a waitcnt has just been inserted. - if (!SWaitInst && InsertNOP) { - BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0); - } - InsertNOP = false; - - // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM - // or SMEM clause, respectively. - // - // The temporary workaround is to break the clauses with S_NOP. - // - // The proper solution would be to allocate registers such that all source - // and destination registers don't overlap, e.g. this is illegal: - // r0 = load r2 - // r2 = load r0 - bool IsSMEM = false; - bool IsVMEM = false; - if (TII->isSMRD(Inst)) - IsSMEM = true; - else if (TII->usesVM_CNT(Inst)) - IsVMEM = true; - - ++Iter; - if (Iter == E) - break; - - MachineInstr &Next = *Iter; - - // TODO: How about consecutive SMEM instructions? - // The comments above says break the clause but the code does not. - // if ((TII->isSMRD(next) && isSMEM) || - if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM && - // TODO: Enable this check when hasSoftClause is upstreamed. - // ST->hasSoftClauses() && - ST->isXNACKEnabled()) { - // Insert a NOP to break the clause. - InsertNOP = true; - continue; - } - - // There must be "S_NOP 0" between an instruction writing M0 and - // S_SENDMSG. 
- if ((Next.getOpcode() == AMDGPU::S_SENDMSG || - Next.getOpcode() == AMDGPU::S_SENDMSGHALT) && - Inst.definesRegister(AMDGPU::M0)) - InsertNOP = true; - - continue; - } - ++Iter; } @@ -1752,13 +1726,13 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); if (!ScoreBrackets) { - BlockWaitcntBracketsMap[&MBB] = make_unique<BlockWaitcntBrackets>(); + BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(); ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get(); } ScoreBrackets->setPostOrder(MBB.getNumber()); MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB); if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr) - LoopWaitcntDataMap[ContainingLoop] = make_unique<LoopWaitcntData>(); + LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>(); // If we are walking into the block from before the loop, then guarantee // at least 1 re-walk over the loop to propagate the information, even if @@ -1819,12 +1793,10 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { - if (!HaveScalarStores && TII->isScalarStore(*I)) HaveScalarStores = true; @@ -1847,7 +1819,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ++I) { - if (I->getOpcode() == AMDGPU::S_DCACHE_WB) SeenDCacheWB = true; else if (TII->isScalarStore(*I)) diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp index bc86515d8b1f..b074b95c2d3c 100644 --- a/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -1,4 +1,4 @@ -//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// +//===- SILowerControlFlow.cpp - Use predicates for control flow -----------===// // // The LLVM Compiler Infrastructure // @@ -33,15 +33,14 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetRegisterInfo.h" #include <algorithm> #include <cassert> #include <cstdint> #include <cstring> -#include <new> #include <utility> #define DEBUG_TYPE "si-insert-waits" @@ -51,23 +50,23 @@ using namespace llvm; namespace { /// \brief One variable for each of the hardware counters -typedef union { +using Counters = union { struct { unsigned VM; unsigned EXP; unsigned LGKM; } Named; unsigned Array[3]; -} Counters; +}; -typedef enum { +using InstType = enum { OTHER, SMEM, VMEM -} InstType; +}; -typedef Counters RegCounters[512]; -typedef std::pair<unsigned, unsigned> RegInterval; +using RegCounters = Counters[512]; +using RegInterval = std::pair<unsigned, unsigned>; class SIInsertWaits : public MachineFunctionPass { private: @@ -409,7 +408,6 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, // Adjust the value to the real hardware possibilities. 
Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]); - } else Counts.Array[i] = 0; @@ -568,12 +566,10 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { - if (!HaveScalarStores && TII->isScalarStore(*I)) HaveScalarStores = true; @@ -671,7 +667,6 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ++I) { - if (I->getOpcode() == AMDGPU::S_DCACHE_WB) SeenDCacheWB = true; else if (TII->isScalarStore(*I)) diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td index 02c9b4b1f0ee..25917cc06e6a 100644 --- a/lib/Target/AMDGPU/SIInstrFormats.td +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -11,9 +11,18 @@ // //===----------------------------------------------------------------------===// +def isGCN : Predicate<"Subtarget->getGeneration() " + ">= SISubtarget::SOUTHERN_ISLANDS">, + AssemblerPredicate<"FeatureGCN">; +def isSI : Predicate<"Subtarget->getGeneration() " + "== SISubtarget::SOUTHERN_ISLANDS">, + AssemblerPredicate<"FeatureSouthernIslands">; + + class InstSI <dag outs, dag ins, string asm = "", list<dag> pattern = []> : AMDGPUInst<outs, ins, asm, pattern>, PredicateControl { + let SubtargetPredicate = isGCN; // Low bits - basic encoding information. field bit SALU = 0; @@ -45,7 +54,7 @@ class InstSI <dag outs, dag ins, string asm = "", field bit FLAT = 0; field bit DS = 0; - // Pseudo instruction formats. + // Pseudo instruction formats. field bit VGPRSpill = 0; field bit SGPRSpill = 0; @@ -79,10 +88,36 @@ class InstSI <dag outs, dag ins, string asm = "", // is unable to infer the encoding from the operands. field bit VOPAsmPrefer32Bit = 0; + // This bit indicates that this is a VOP3 opcode which supports op_sel + // modifier (gfx9 only). + field bit VOP3_OPSEL = 0; + + // Is it possible for this instruction to be atomic? + field bit maybeAtomic = 0; + + // This bit indicates that this is a VI instruction which is renamed + // in GFX9. Required for correct mapping from pseudo to MC. + field bit renamedInGFX9 = 0; + // This bit indicates that this has a floating point result type, so // the clamp modifier has floating point semantics. field bit FPClamp = 0; + // This bit indicates that instruction may support integer clamping + // which depends on GPU features. + field bit IntClamp = 0; + + // This field indicates that the clamp applies to the low component + // of a packed output register. + field bit ClampLo = 0; + + // This field indicates that the clamp applies to the high component + // of a packed output register. + field bit ClampHi = 0; + + // This bit indicates that this is a packed VOP3P instruction + field bit IsPacked = 0; + // These need to be kept in sync with the enum in SIInstrFlags. 
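The patch appends several new instruction property bits to TSFlags (op_sel support, maybeAtomic, the GFX9 renaming marker, the clamp variants, and IsPacked), and the `let TSFlags{...}` assignments that follow place them at bits 42 through 49. A simplified C++ mirror of that layout, with illustrative enumerator names (the real ones live in the SIInstrFlags enum that this comment says must stay in sync):

```cpp
#include <cstdint>

// Illustrative mirror of the new TSFlags bit positions added by this patch.
namespace SIFlagsSketch {
  constexpr uint64_t VOP3_OPSEL    = 1ull << 42;
  constexpr uint64_t maybeAtomic   = 1ull << 43;
  constexpr uint64_t renamedInGFX9 = 1ull << 44;
  constexpr uint64_t FPClamp       = 1ull << 45;
  constexpr uint64_t IntClamp      = 1ull << 46;
  constexpr uint64_t ClampLo       = 1ull << 47;
  constexpr uint64_t ClampHi       = 1ull << 48;
  constexpr uint64_t IsPacked      = 1ull << 49;
}

// Typical query: does this instruction support integer clamping?
inline bool hasIntClamp(uint64_t TSFlags) {
  return TSFlags & SIFlagsSketch::IntClamp;
}
```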
let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -126,7 +161,17 @@ class InstSI <dag outs, dag ins, string asm = "", let TSFlags{39} = ScalarStore; let TSFlags{40} = FixedSize; let TSFlags{41} = VOPAsmPrefer32Bit; - let TSFlags{42} = FPClamp; + let TSFlags{42} = VOP3_OPSEL; + + let TSFlags{43} = maybeAtomic; + let TSFlags{44} = renamedInGFX9; + + let TSFlags{45} = FPClamp; + let TSFlags{46} = IntClamp; + let TSFlags{47} = ClampLo; + let TSFlags{48} = ClampHi; + + let TSFlags{49} = IsPacked; let SchedRW = [Write32Bit]; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index a7e0feb10b9f..61967605432e 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1,4 +1,4 @@ -//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// +//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// // // The LLVM Compiler Infrastructure // @@ -13,19 +13,52 @@ //===----------------------------------------------------------------------===// #include "SIInstrInfo.h" -#include "AMDGPUTargetMachine.h" +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "GCNHazardRecognizer.h" #include "SIDefines.h" #include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/MC/MCInstrDesc.h" -#include "llvm/Support/Debug.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Target/TargetMachine.h" +#include <cassert> +#include <cstdint> +#include <iterator> +#include <utility> using namespace llvm; @@ -305,26 +338,77 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, } if (isFLAT(LdSt)) { - const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); - BaseReg = AddrReg->getReg(); - Offset = 0; + const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + if (VAddr) { + // Can't analyze 2 offsets. + if (getNamedOperand(LdSt, AMDGPU::OpName::saddr)) + return false; + + BaseReg = VAddr->getReg(); + } else { + // scratch instructions have either vaddr or saddr. 
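In the getMemOpBaseRegImmOfs hunk above, FLAT loads and stores now report a base register and immediate offset: the vector address is preferred, an instruction carrying both vaddr and saddr is rejected as having two bases, and scratch forms may carry only saddr, which the line that follows falls back to. A standalone sketch of that selection, assuming simple optionals in place of the named machine operands:

```cpp
#include <cstdint>
#include <optional>

struct FlatAddress {
  std::optional<unsigned> VAddrReg;  // vector address register, if present
  std::optional<unsigned> SAddrReg;  // scalar address register, if present
  int64_t Offset = 0;
};

// Pick the single base register used for load/store clustering, or return
// std::nullopt when the addressing cannot be described by one base.
std::optional<unsigned> flatBaseReg(const FlatAddress &A) {
  if (A.VAddrReg) {
    if (A.SAddrReg)
      return std::nullopt;  // two address operands: can't analyze
    return *A.VAddrReg;
  }
  // Scratch instructions have either vaddr or saddr.
  return A.SAddrReg;
}
```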
+ BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg(); + } + + Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); return true; } return false; } +static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1, + const MachineInstr &MI2, unsigned BaseReg2) { + if (BaseReg1 == BaseReg2) + return true; + + if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) + return false; + + auto MO1 = *MI1.memoperands_begin(); + auto MO2 = *MI2.memoperands_begin(); + if (MO1->getAddrSpace() != MO2->getAddrSpace()) + return false; + + auto Base1 = MO1->getValue(); + auto Base2 = MO2->getValue(); + if (!Base1 || !Base2) + return false; + const MachineFunction &MF = *MI1.getParent()->getParent(); + const DataLayout &DL = MF.getFunction().getParent()->getDataLayout(); + Base1 = GetUnderlyingObject(Base1, DL); + Base2 = GetUnderlyingObject(Base1, DL); + + if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2)) + return false; + + return Base1 == Base2; +} + bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, + unsigned BaseReg1, MachineInstr &SecondLdSt, + unsigned BaseReg2, unsigned NumLoads) const { + if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2)) + return false; + const MachineOperand *FirstDst = nullptr; const MachineOperand *SecondDst = nullptr; if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { + const unsigned MaxGlobalLoadCluster = 6; + if (NumLoads > MaxGlobalLoadCluster) + return false; + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); + if (!FirstDst) + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); + if (!SecondDst) + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); @@ -358,10 +442,10 @@ static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) { MachineFunction *MF = MBB.getParent(); - DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(), + DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), "illegal SGPR to VGPR copy", DL, DS_Error); - LLVMContext &C = MF->getFunction()->getContext(); + LLVMContext &C = MF->getFunction().getContext(); C.diagnose(IllegalCopy); BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) @@ -452,7 +536,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } } - ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); @@ -566,15 +649,18 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, "Not a VGPR32 reg"); if (Cond.size() == 1) { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) + .add(Cond[0]); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) .addReg(FalseReg) .addReg(TrueReg) - .add(Cond[0]); + .addReg(SReg); } else if (Cond.size() == 2) { assert(Cond[0].isImm() && "Cond[0] is not an immediate"); switch (Cond[0].getImm()) { case SIInstrInfo::SCC_TRUE: { - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); BuildMI(MBB, I, DL, 
get(AMDGPU::S_CSELECT_B64), SReg) .addImm(-1) .addImm(0); @@ -585,7 +671,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, break; } case SIInstrInfo::SCC_FALSE: { - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) .addImm(0) .addImm(-1); @@ -598,23 +684,29 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, case SIInstrInfo::VCCNZ: { MachineOperand RegOp = Cond[1]; RegOp.setImplicit(false); + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) + .add(RegOp); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) .addReg(FalseReg) .addReg(TrueReg) - .add(RegOp); + .addReg(SReg); break; } case SIInstrInfo::VCCZ: { MachineOperand RegOp = Cond[1]; RegOp.setImplicit(false); + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) + .add(RegOp); BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) .addReg(TrueReg) .addReg(FalseReg) - .add(RegOp); + .addReg(SReg); break; } case SIInstrInfo::EXECNZ: { - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) .addImm(0); @@ -628,7 +720,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, break; } case SIInstrInfo::EXECZ: { - unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) .addImm(0); @@ -735,6 +827,10 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineFrameInfo &FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); + assert(SrcReg != MFI->getStackPtrOffsetReg() && + SrcReg != MFI->getFrameOffsetReg() && + SrcReg != MFI->getScratchWaveOffsetReg()); + unsigned Size = FrameInfo.getObjectSize(FrameIndex); unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); MachinePointerInfo PtrInfo @@ -768,6 +864,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, // needing them, and need to ensure that the reserved registers are // correctly handled. + FrameInfo.setStackID(FrameIndex, 1); if (ST.hasScalarStores()) { // m0 is used for offset to scalar stores if used to spill. 
Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead); @@ -776,8 +873,8 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, return; } - if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { - LLVMContext &Ctx = MF->getFunction()->getContext(); + if (!ST.isVGPRSpillingEnabled(MF->getFunction())) { + LLVMContext &Ctx = MF->getFunction().getContext(); Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" " spill register"); BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) @@ -863,6 +960,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); } + FrameInfo.setStackID(FrameIndex, 1); MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg) .addFrameIndex(FrameIndex) // addr .addMemOperand(MMO) @@ -877,8 +975,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, return; } - if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { - LLVMContext &Ctx = MF->getFunction()->getContext(); + if (!ST.isVGPRSpillingEnabled(MF->getFunction())) { + LLVMContext &Ctx = MF->getFunction().getContext(); Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" " restore register"); BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); @@ -904,7 +1002,6 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); unsigned WavefrontSize = ST.getWavefrontSize(); @@ -920,17 +1017,16 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( if (TIDReg == AMDGPU::NoRegister) return TIDReg; - if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) && + if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) && WorkGroupSize > WavefrontSize) { - unsigned TIDIGXReg - = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); + = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X); unsigned TIDIGYReg - = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); + = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); unsigned TIDIGZReg - = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); + = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); unsigned InputPtrReg = - TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) Entry.addLiveIn(Reg); @@ -961,9 +1057,9 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( .addReg(TIDIGYReg) .addReg(TIDReg); // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROPUS.Y * TIDIG.X)) + TIDIG.Z - BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg) - .addReg(TIDReg) - .addReg(TIDIGZReg); + getAddNoCarry(Entry, Insert, DL, TIDReg) + .addReg(TIDReg) + .addReg(TIDIGZReg); } else { // Get the wave id BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), @@ -986,9 +1082,9 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( // Add FrameIndex to LDS offset unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize); - BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) - .addImm(LDSOffset) - .addReg(TIDReg); + getAddNoCarry(MBB, MI, DL, TmpReg) + .addImm(LDSOffset) + .addReg(TIDReg); return TmpReg; } @@ -1042,24 +1138,24 @@ bool 
SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { DebugLoc DL = MBB.findDebugLoc(MI); switch (MI.getOpcode()) { default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - case AMDGPU::S_MOV_B64_term: { + case AMDGPU::S_MOV_B64_term: // This is only a terminator to get the correct spill code placement during // register allocation. MI.setDesc(get(AMDGPU::S_MOV_B64)); break; - } - case AMDGPU::S_XOR_B64_term: { + + case AMDGPU::S_XOR_B64_term: // This is only a terminator to get the correct spill code placement during // register allocation. MI.setDesc(get(AMDGPU::S_XOR_B64)); break; - } - case AMDGPU::S_ANDN2_B64_term: { + + case AMDGPU::S_ANDN2_B64_term: // This is only a terminator to get the correct spill code placement during // register allocation. MI.setDesc(get(AMDGPU::S_ANDN2_B64)); break; - } + case AMDGPU::V_MOV_B64_PSEUDO: { unsigned Dst = MI.getOperand(0).getReg(); unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -1088,6 +1184,28 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } + case AMDGPU::V_SET_INACTIVE_B32: { + BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) + .add(MI.getOperand(2)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); + MI.eraseFromParent(); + break; + } + case AMDGPU::V_SET_INACTIVE_B64: { + BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); + MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), + MI.getOperand(0).getReg()) + .add(MI.getOperand(2)); + expandPostRAPseudo(*Copy); + BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); + MI.eraseFromParent(); + break; + } case AMDGPU::V_MOVRELD_B32_V1: case AMDGPU::V_MOVRELD_B32_V2: case AMDGPU::V_MOVRELD_B32_V4: @@ -1140,11 +1258,17 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MIB.add(MI.getOperand(2)); Bundler.append(MIB); - llvm::finalizeBundle(MBB, Bundler.begin()); + finalizeBundle(MBB, Bundler.begin()); MI.eraseFromParent(); break; } + case AMDGPU::EXIT_WWM: { + // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM + // is exited. 
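Alongside the EXIT_WWM marker handled next, the V_SET_INACTIVE_B32/B64 expansions earlier in this switch write a value into the lanes that are currently disabled: EXEC is inverted with S_NOT_B64, the move executes only in the formerly inactive lanes, and EXEC is inverted back; the active lanes are presumably preserved through a source tied to the destination. A per-lane model of the net effect of the expansion itself, with the inactive value modelled as a uniform scalar for brevity (this is a sketch of the semantics, not the MIR expansion):

```cpp
#include <array>
#include <cstdint>

// Net effect on the destination register, per lane of a 64-lane wave:
// lanes enabled in EXEC are left untouched, disabled lanes get `inactiveVal`.
void setInactiveLanes(uint64_t exec, std::array<uint32_t, 64> &dst,
                      uint32_t inactiveVal) {
  for (unsigned lane = 0; lane < 64; ++lane)
    if (!((exec >> lane) & 1))
      dst[lane] = inactiveVal;
}
```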
+ MI.setDesc(get(AMDGPU::S_MOV_B64)); + break; + } } return true; } @@ -1232,7 +1356,6 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, return nullptr; } - if (CommutedMI) { swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, Src1, AMDGPU::OpName::src1_modifiers); @@ -1542,7 +1665,6 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const { - if (!FBB && Cond.empty()) { BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) .addMBB(TBB); @@ -1760,6 +1882,23 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const { } } +unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind( + PseudoSourceValue::PSVKind Kind) const { + switch(Kind) { + case PseudoSourceValue::Stack: + case PseudoSourceValue::FixedStack: + return AMDGPUASI.PRIVATE_ADDRESS; + case PseudoSourceValue::ConstantPool: + case PseudoSourceValue::GOT: + case PseudoSourceValue::JumpTable: + case PseudoSourceValue::GlobalValueCallEntry: + case PseudoSourceValue::ExternalSymbolCallEntry: + case PseudoSourceValue::TargetCustom: + return AMDGPUASI.CONSTANT_ADDRESS; + } + return AMDGPUASI.FLAT_ADDRESS; +} + static void removeModOperands(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, @@ -1779,28 +1918,29 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (!MRI->hasOneNonDBGUse(Reg)) return false; + switch (DefMI.getOpcode()) { + default: + return false; + case AMDGPU::S_MOV_B64: + // TODO: We could fold 64-bit immediates, but this get compilicated + // when there are sub-registers. + return false; + + case AMDGPU::V_MOV_B32_e32: + case AMDGPU::S_MOV_B32: + break; + } + + const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); + assert(ImmOp); + // FIXME: We could handle FrameIndex values here. + if (!ImmOp->isImm()) + return false; + unsigned Opc = UseMI.getOpcode(); if (Opc == AMDGPU::COPY) { bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); - switch (DefMI.getOpcode()) { - default: - return false; - case AMDGPU::S_MOV_B64: - // TODO: We could fold 64-bit immediates, but this get compilicated - // when there are sub-registers. - return false; - - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::S_MOV_B32: - break; - } unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; - const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0); - assert(ImmOp); - // FIXME: We could handle FrameIndex values here. - if (!ImmOp->isImm()) { - return false; - } UseMI.setDesc(get(NewOpc)); UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); @@ -1814,15 +1954,13 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (hasAnyModifiersSet(UseMI)) return false; - const MachineOperand &ImmOp = DefMI.getOperand(1); - // If this is a free constant, there's no reason to do this. // TODO: We could fold this here instead of letting SIFoldOperands do it // later. MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); // Any src operand can be used for the legality check. 
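The hunk above hoists FoldImmediate's sanity checks ahead of the per-use handling. A compact restatement of that gate, using plain C++ structs instead of the MachineIR types (names are mine, not LLVM API):

// The defining instruction must be a 32-bit move of a true immediate
// (64-bit moves are still rejected, FrameIndex operands too) and the
// defined register must have exactly one non-debug use.
struct ImmDef {
  bool isMovB32;        // V_MOV_B32_e32 or S_MOV_B32
  bool srcIsImmediate;  // src0 is a plain immediate
  unsigned nonDebugUses;
};

bool worthTryingToFold(const ImmDef &Def) {
  return Def.nonDebugUses == 1 && Def.isMovB32 && Def.srcIsImmediate;
}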
- if (isInlineConstant(UseMI, *Src0, ImmOp)) + if (isInlineConstant(UseMI, *Src0, *ImmOp)) return false; bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64; @@ -1840,7 +1978,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // We need to swap operands 0 and 1 since madmk constant is at operand 1. - const int64_t Imm = DefMI.getOperand(1).getImm(); + const int64_t Imm = ImmOp->getImm(); // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. @@ -1885,7 +2023,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) return false; - const int64_t Imm = DefMI.getOperand(1).getImm(); + const int64_t Imm = ImmOp->getImm(); // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. @@ -1985,7 +2123,7 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, if (isDS(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(MIb); + return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb); } if (isMUBUF(MIa) || isMTBUF(MIa)) { @@ -2012,6 +2150,18 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, return false; } +static int64_t getFoldableImm(const MachineOperand* MO) { + if (!MO->isReg()) + return false; + const MachineFunction *MF = MO->getParent()->getParent()->getParent(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + auto Def = MRI.getUniqueVRegDef(MO->getReg()); + if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 && + Def->getOperand(1).isImm()) + return Def->getOperand(1).getImm(); + return AMDGPU::NoRegister; +} + MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, MachineInstr &MI, LiveVariables *LV) const { @@ -2032,8 +2182,12 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); const MachineOperand *Src0 = &MI.getOperand(Src0Idx); + if (!Src0->isReg() && !Src0->isImm()) + return nullptr; + if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) return nullptr; + break; } } @@ -2049,6 +2203,37 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp); const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); + if (!Src0Mods && !Src1Mods && !Clamp && !Omod && + // If we have an SGPR input, we will violate the constant bus restriction. + (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { + if (auto Imm = getFoldableImm(Src2)) { + return BuildMI(*MBB, MI, MI.getDebugLoc(), + get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32)) + .add(*Dst) + .add(*Src0) + .add(*Src1) + .addImm(Imm); + } + if (auto Imm = getFoldableImm(Src1)) { + return BuildMI(*MBB, MI, MI.getDebugLoc(), + get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32)) + .add(*Dst) + .add(*Src0) + .addImm(Imm) + .add(*Src2); + } + if (auto Imm = getFoldableImm(Src0)) { + if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32, + AMDGPU::OpName::src0), Src1)) + return BuildMI(*MBB, MI, MI.getDebugLoc(), + get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32)) + .add(*Dst) + .add(*Src1) + .addImm(Imm) + .add(*Src2); + } + } + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(IsF16 ? 
AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32)) .add(*Dst) @@ -2133,10 +2318,9 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: - case AMDGPU::OPERAND_REG_INLINE_C_FP64: { + case AMDGPU::OPERAND_REG_INLINE_C_FP64: return AMDGPU::isInlinableLiteral64(MO.getImm(), ST.hasInv2PiInlineImm()); - } case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: @@ -2439,7 +2623,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, // Verify SDWA if (isSDWA(MI)) { - if (!ST.hasSDWA()) { ErrInfo = "SDWA is not supported on this target"; return false; @@ -2504,6 +2687,28 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } } + + const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused); + if (DstUnused && DstUnused->isImm() && + DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) { + const MachineOperand &Dst = MI.getOperand(DstIdx); + if (!Dst.isReg() || !Dst.isTied()) { + ErrInfo = "Dst register should have tied register"; + return false; + } + + const MachineOperand &TiedMO = + MI.getOperand(MI.findTiedOperandIdx(DstIdx)); + if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) { + ErrInfo = + "Dst register should be tied to implicit use of preserved register"; + return false; + } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) && + Dst.getReg() != TiedMO.getReg()) { + ErrInfo = "Dst register should use same physical register as preserved"; + return false; + } + } } // Verify VOP* @@ -2648,21 +2853,30 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return true; } -unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { +unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: return AMDGPU::INSTRUCTION_LIST_END; case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE; case AMDGPU::COPY: return AMDGPU::COPY; case AMDGPU::PHI: return AMDGPU::PHI; case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG; + case AMDGPU::WQM: return AMDGPU::WQM; + case AMDGPU::WWM: return AMDGPU::WWM; case AMDGPU::S_MOV_B32: return MI.getOperand(1).isReg() ? AMDGPU::COPY : AMDGPU::V_MOV_B32_e32; case AMDGPU::S_ADD_I32: - case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32; - case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32; + return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32; + case AMDGPU::S_ADDC_U32: + return AMDGPU::V_ADDC_U32_e32; case AMDGPU::S_SUB_I32: - case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32; + return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32; + // FIXME: These are not consistently handled, and selected when the carry is + // used. 
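The getVALUOp changes above pick between the legacy carry-producing add and the gfx9 no-carry add, and the carry chain reappears in splitScalar64BitAddSub further down. A stand-alone value-level model (one value per lane, helper names invented here, not LLVM API):

#include <cstdint>
#include <utility>

// V_ADD_I32-style add: result plus a carry-out (what VCC would hold).
std::pair<uint32_t, bool> addCarryOut(uint32_t a, uint32_t b) {
  uint64_t r = uint64_t(a) + uint64_t(b);
  return {uint32_t(r), r > UINT32_MAX};
}

// V_ADD_U32_e64-style add on gfx9: same result, no carry defined.
uint32_t addNoCarry(uint32_t a, uint32_t b) {
  return a + b;
}

// 64-bit add split into 32-bit halves, as splitScalar64BitAddSub does
// for S_ADD_U64_PSEUDO: lo half produces the carry, hi half consumes it
// (V_ADD_I32 followed by V_ADDC_U32), then the halves are recombined.
uint64_t add64Split(uint64_t x, uint64_t y) {
  std::pair<uint32_t, bool> lo = addCarryOut(uint32_t(x), uint32_t(y));
  uint32_t hi = uint32_t(x >> 32) + uint32_t(y >> 32) + (lo.second ? 1 : 0);
  return (uint64_t(hi) << 32) | lo.first;
}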
+ case AMDGPU::S_ADD_U32: + return AMDGPU::V_ADD_I32_e32; + case AMDGPU::S_SUB_U32: + return AMDGPU::V_SUB_I32_e32; case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32; case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32; case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64; @@ -2709,10 +2923,6 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { } } -bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const { - return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END; -} - const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI, unsigned OpNo) const { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); @@ -3090,7 +3300,6 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const { - unsigned OpReg = Op.getReg(); unsigned OpSubReg = Op.getSubReg(); @@ -3235,7 +3444,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { // scratch memory access. In both cases, the legalization never involves // conversion to the addr64 form. if (isMIMG(MI) || - (AMDGPU::isShader(MF.getFunction()->getCallingConv()) && + (AMDGPU::isShader(MF.getFunction().getCallingConv()) && (isMUBUF(MI) || isMTBUF(MI)))) { MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) { @@ -3423,6 +3632,19 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { switch (Opcode) { default: break; + case AMDGPU::S_ADD_U64_PSEUDO: + case AMDGPU::S_SUB_U64_PSEUDO: + splitScalar64BitAddSub(Worklist, Inst); + Inst.eraseFromParent(); + continue; + case AMDGPU::S_ADD_I32: + case AMDGPU::S_SUB_I32: + // FIXME: The u32 versions currently selected use the carry. + if (moveScalarAddSub(Worklist, Inst)) + continue; + + // Default handling + break; case AMDGPU::S_AND_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); Inst.eraseFromParent(); @@ -3448,11 +3670,10 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { Inst.eraseFromParent(); continue; - case AMDGPU::S_BFE_I64: { + case AMDGPU::S_BFE_I64: splitScalar64BitBFE(Worklist, Inst); Inst.eraseFromParent(); continue; - } case AMDGPU::S_LSHL_B32: if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { @@ -3511,10 +3732,78 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { case AMDGPU::S_PACK_LL_B32_B16: case AMDGPU::S_PACK_LH_B32_B16: - case AMDGPU::S_PACK_HH_B32_B16: { + case AMDGPU::S_PACK_HH_B32_B16: movePackToVALU(Worklist, MRI, Inst); Inst.eraseFromParent(); continue; + + case AMDGPU::S_XNOR_B32: + lowerScalarXnor(Worklist, Inst); + Inst.eraseFromParent(); + continue; + + case AMDGPU::S_XNOR_B64: + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32); + Inst.eraseFromParent(); + continue; + + case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR: { + unsigned VDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + const MachineOperand *VAddr = getNamedOperand(Inst, AMDGPU::OpName::soff); + auto Add = MRI.getUniqueVRegDef(VAddr->getReg()); + unsigned Offset = 0; + + // FIXME: This isn't safe because the addressing mode doesn't work + // correctly if vaddr is negative. + // + // FIXME: Handle v_add_u32 and VOP3 form. Also don't rely on immediate + // being in src0. + // + // FIXME: Should probably be done somewhere else, maybe SIFoldOperands. 
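Before the offset-recognition code that follows, here is a stand-alone sketch of the split being attempted, assuming only the 12-bit unsigned MUBUF immediate limit that isLegalMUBUFImmOffset checks (the sketch ignores the signedness caveat the FIXME above raises; names are mine):

#include <cstdint>
#include <utility>

// Given a buffer address known to be base + imm, keep the immediate in
// the MUBUF offset field when it fits 12 unsigned bits; otherwise leave
// the whole address in the register and use a zero offset.
std::pair<uint32_t, uint32_t> splitBufferAddress(uint32_t base, uint32_t imm) {
  const uint32_t MaxImm = (1u << 12) - 1;   // isUInt<12>
  if (imm != 0 && imm <= MaxImm)
    return {base, imm};      // vaddr = base, offset field = imm
  return {base + imm, 0};    // no fold: full address stays in vaddr
}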
+ // + // See if we can extract an immediate offset by recognizing one of these: + // V_ADD_I32_e32 dst, imm, src1 + // V_ADD_I32_e32 dst, (S_MOV_B32 imm), src1 + // V_ADD will be removed by "Remove dead machine instructions". + if (Add && Add->getOpcode() == AMDGPU::V_ADD_I32_e32) { + const MachineOperand *Src = + getNamedOperand(*Add, AMDGPU::OpName::src0); + + if (Src->isReg()) { + auto Mov = MRI.getUniqueVRegDef(Src->getReg()); + if (Mov && Mov->getOpcode() == AMDGPU::S_MOV_B32) + Src = &Mov->getOperand(1); + } + + if (Src) { + if (Src->isImm()) + Offset = Src->getImm(); + else if (Src->isCImm()) + Offset = Src->getCImm()->getZExtValue(); + } + + if (Offset && isLegalMUBUFImmOffset(Offset)) + VAddr = getNamedOperand(*Add, AMDGPU::OpName::src1); + else + Offset = 0; + } + + BuildMI(*MBB, Inst, Inst.getDebugLoc(), + get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), VDst) + .add(*VAddr) // vaddr + .add(*getNamedOperand(Inst, AMDGPU::OpName::sbase)) // srsrc + .addImm(0) // soffset + .addImm(Offset) // offset + .addImm(getNamedOperand(Inst, AMDGPU::OpName::glc)->getImm()) + .addImm(0) // slc + .addImm(0) // tfe + .setMemRefs(Inst.memoperands_begin(), Inst.memoperands_end()); + + MRI.replaceRegWith(getNamedOperand(Inst, AMDGPU::OpName::sdst)->getReg(), + VDst); + addUsersToMoveToVALUWorklist(VDst, MRI, Worklist); + Inst.eraseFromParent(); + continue; } } @@ -3610,6 +3899,41 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } } +// Add/sub require special handling to deal with carry outs. +bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, + MachineInstr &Inst) const { + if (ST.hasAddNoCarry()) { + // Assume there is no user of scc since we don't select this in that case. + // Since scc isn't used, it doesn't really matter if the i32 or u32 variant + // is used. + + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + unsigned OldDstReg = Inst.getOperand(0).getReg(); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + unsigned Opc = Inst.getOpcode(); + assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32); + + unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ? + AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64; + + assert(Inst.getOperand(3).getReg() == AMDGPU::SCC); + Inst.RemoveOperand(3); + + Inst.setDesc(get(NewOpc)); + Inst.addImplicitDefUseOperands(*MBB.getParent()); + MRI.replaceRegWith(OldDstReg, ResultReg); + legalizeOperands(Inst); + + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); + return true; + } + + return false; +} + void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -3622,7 +3946,10 @@ void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) + unsigned SubOp = ST.hasAddNoCarry() ? 
+ AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32; + + BuildMI(MBB, MII, DL, get(SubOp), TmpReg) .addImm(0) .addReg(Src.getReg()); @@ -3634,6 +3961,33 @@ void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } +void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist, + MachineInstr &Inst) const { + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + const DebugLoc &DL = Inst.getDebugLoc(); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + + legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL); + legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL); + + unsigned Xor = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(MBB, MII, DL, get(AMDGPU::V_XOR_B32_e64), Xor) + .add(Src0) + .add(Src1); + + unsigned Not = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(MBB, MII, DL, get(AMDGPU::V_NOT_B32_e64), Not) + .addReg(Xor); + + MRI.replaceRegWith(Dest.getReg(), Not); + addUsersToMoveToVALUWorklist(Not, MRI, Worklist); +} + void SIInstrInfo::splitScalar64BitUnaryOp( SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const { @@ -3685,6 +4039,74 @@ void SIInstrInfo::splitScalar64BitUnaryOp( addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } +void SIInstrInfo::splitScalar64BitAddSub( + SetVectorType &Worklist, MachineInstr &Inst) const { + bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); + + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + const DebugLoc &DL = Inst.getDebugLoc(); + MachineBasicBlock::iterator MII = Inst; + + const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); + const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); + const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0); + const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0); + + MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub0, Src0SubRC); + MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, + AMDGPU::sub0, Src1SubRC); + + + MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, + AMDGPU::sub1, Src0SubRC); + MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, + AMDGPU::sub1, Src1SubRC); + + unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; + MachineInstr *LoHalf = + BuildMI(MBB, MII, DL, get(LoOpc), DestSub0) + .addReg(CarryReg, RegState::Define) + .add(SrcReg0Sub0) + .add(SrcReg1Sub0); + + unsigned HiOpc = IsAdd ? 
AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; + MachineInstr *HiHalf = + BuildMI(MBB, MII, DL, get(HiOpc), DestSub1) + .addReg(DeadCarryReg, RegState::Define | RegState::Dead) + .add(SrcReg0Sub1) + .add(SrcReg1Sub1) + .addReg(CarryReg, RegState::Kill); + + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + + // Try to legalize the operands in case we need to swap the order to keep it + // valid. + legalizeOperands(*LoHalf); + legalizeOperands(*HiHalf); + + // Move all users of this moved vlaue. + addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); +} + void SIInstrInfo::splitScalar64BitBinaryOp( SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const { @@ -3936,8 +4358,8 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist( // This assumes that all the users of SCC are in the same block // as the SCC def. for (MachineInstr &MI : - llvm::make_range(MachineBasicBlock::iterator(SCCDefInst), - SCCDefInst.getParent()->end())) { + make_range(MachineBasicBlock::iterator(SCCDefInst), + SCCDefInst.getParent()->end())) { // Exit if we find another SCC def. if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1) return; @@ -3959,6 +4381,8 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( case AMDGPU::PHI: case AMDGPU::REG_SEQUENCE: case AMDGPU::INSERT_SUBREG: + case AMDGPU::WQM: + case AMDGPU::WWM: if (RI.hasVGPRs(NewDstRC)) return nullptr; @@ -4123,7 +4547,6 @@ unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI, unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const { - if (!MI.mayLoad()) return AMDGPU::NoRegister; @@ -4150,6 +4573,18 @@ unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI, return AMDGPU::NoRegister; } +unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const { + unsigned Size = 0; + MachineBasicBlock::const_instr_iterator I = MI.getIterator(); + MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); + while (++I != E && I->isInsideBundle()) { + assert(!I->isBundle() && "No nested bundle!"); + Size += getInstSizeInBytes(*I); + } + + return Size; +} + unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); @@ -4193,9 +4628,10 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: case TargetOpcode::DBG_VALUE: - case TargetOpcode::BUNDLE: case TargetOpcode::EH_LABEL: return 0; + case TargetOpcode::BUNDLE: + return getInstBundleSize(MI); case TargetOpcode::INLINEASM: { const MachineFunction *MF = MI.getParent()->getParent(); const char *AsmStr = MI.getOperand(0).getSymbolName(); @@ -4350,10 +4786,34 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg) const { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + if (ST.hasAddNoCarry()) + return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC); return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg) .addReg(UnusedCarry, RegState::Define | RegState::Dead); } + +bool 
SIInstrInfo::isKillTerminator(unsigned Opcode) { + switch (Opcode) { + case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: + case AMDGPU::SI_KILL_I1_TERMINATOR: + return true; + default: + return false; + } +} + +const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const { + switch (Opcode) { + case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: + return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR); + case AMDGPU::SI_KILL_I1_PSEUDO: + return get(AMDGPU::SI_KILL_I1_TERMINATOR); + default: + llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO"); + } +} diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 3dd5bc89e6c7..24ee843e6ade 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -1,4 +1,4 @@ -//===-- SIInstrInfo.h - SI Instruction Info Interface -----------*- C++ -*-===// +//===- SIInstrInfo.h - SI Instruction Info Interface ------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -12,17 +12,33 @@ // //===----------------------------------------------------------------------===// - #ifndef LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H #define LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H #include "AMDGPUInstrInfo.h" #include "SIDefines.h" #include "SIRegisterInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Compiler.h" +#include <cassert> +#include <cstdint> namespace llvm { +class APInt; +class MachineRegisterInfo; +class RegScavenger; +class SISubtarget; +class TargetRegisterClass; + class SIInstrInfo final : public AMDGPUInstrInfo { private: const SIRegisterInfo RI; @@ -39,11 +55,12 @@ private: EXECZ = 3 }; - typedef SmallSetVector<MachineInstr *, 32> SetVectorType; + using SetVectorType = SmallSetVector<MachineInstr *, 32>; static unsigned getBranchOpcode(BranchPredicate Cond); static BranchPredicate getBranchPredicate(unsigned Opcode); +public: unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, MachineOperand &SuperReg, @@ -56,15 +73,24 @@ private: const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const; - +private: void swapOperands(MachineInstr &Inst) const; + bool moveScalarAddSub(SetVectorType &Worklist, + MachineInstr &Inst) const; + void lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const; + void lowerScalarXnor(SetVectorType &Worklist, + MachineInstr &Inst) const; + void splitScalar64BitUnaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; + void splitScalar64BitAddSub(SetVectorType &Worklist, + MachineInstr &Inst) const; + void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; @@ -76,9 +102,8 @@ private: MachineRegisterInfo &MRI, MachineInstr &Inst) const; - void addUsersToMoveToVALUWorklist( - unsigned Reg, MachineRegisterInfo &MRI, - SetVectorType &Worklist) const; + void addUsersToMoveToVALUWorklist(unsigned Reg, MachineRegisterInfo &MRI, + SetVectorType &Worklist) const; void addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst, @@ -101,7 +126,6 @@ protected: unsigned OpIdx1) const override; public: - enum TargetOperandFlags { MO_MASK = 0x7, @@ -120,7 +144,7 @@ public: MO_REL32_HI = 5 }; - explicit SIInstrInfo(const 
SISubtarget &); + explicit SIInstrInfo(const SISubtarget &ST); const SIRegisterInfo &getRegisterInfo() const { return RI; @@ -137,7 +161,8 @@ public: int64_t &Offset, const TargetRegisterInfo *TRI) const final; - bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt, + bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1, + MachineInstr &SecondLdSt, unsigned BaseReg2, unsigned NumLoads) const final; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, @@ -159,7 +184,7 @@ public: unsigned insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned SrcReg, int Value) const; + unsigned SrcReg, int Value) const; unsigned insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, @@ -228,7 +253,6 @@ public: bool reverseBranchCondition( SmallVectorImpl<MachineOperand> &Cond) const override; - bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg, @@ -245,6 +269,9 @@ public: unsigned DstReg, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const; + unsigned getAddressSpaceForPseudoSourceKind( + PseudoSourceValue::PSVKind Kind) const override; + bool areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; @@ -392,6 +419,19 @@ public: return get(Opcode).TSFlags & SIInstrFlags::SMRD; } + bool isBufferSMRD(const MachineInstr &MI) const { + if (!isSMRD(MI)) + return false; + + // Check that it is using a buffer resource. + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase); + if (Idx == -1) // e.g. s_memtime + return false; + + const auto RCID = MI.getDesc().OpInfo[Idx].RegClass; + return RCID == AMDGPU::SReg_128RegClassID; + } + static bool isDS(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::DS; } @@ -420,6 +460,14 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::FLAT; } + // Is a FLAT encoded instruction which accesses a specific segment, + // i.e. global_* or scratch_*. + static bool isSegmentSpecificFLAT(const MachineInstr &MI) { + auto Flags = MI.getDesc().TSFlags; + return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT); + } + + // Any FLAT encoded instruction, including global_* and scratch_*. 
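The isSegmentSpecificFLAT helper added just above keys off two TSFlags bits. A rough stand-alone model of the test, with made-up bit positions rather than the real SIInstrFlags values; the stated rationale (global_*/scratch_* never touch LDS, which is what the aliasing change in the SIInstrInfo.cpp hunk earlier relies on) is my reading of the comment:

#include <cstdint>

namespace toyflags {
constexpr uint64_t FLAT = 1ull << 0;
constexpr uint64_t LGKM_CNT = 1ull << 1;
}

// A FLAT-encoded instruction that does not count against lgkmcnt is a
// global_* or scratch_* access, so it cannot alias a DS operation.
bool isSegmentSpecificFlat(uint64_t TSFlags) {
  return (TSFlags & toyflags::FLAT) && !(TSFlags & toyflags::LGKM_CNT);
}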
bool isFLAT(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FLAT; } @@ -496,6 +544,10 @@ public: return MI.getDesc().TSFlags & SIInstrFlags::VM_CNT; } + static bool usesLGKM_CNT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::LGKM_CNT; + } + static bool sopkIsZext(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::SOPK_ZEXT; } @@ -523,11 +575,23 @@ public: } static bool hasFPClamp(const MachineInstr &MI) { - return MI.getDesc().TSFlags & SIInstrFlags::HasFPClamp; + return MI.getDesc().TSFlags & SIInstrFlags::FPClamp; } bool hasFPClamp(uint16_t Opcode) const { - return get(Opcode).TSFlags & SIInstrFlags::HasFPClamp; + return get(Opcode).TSFlags & SIInstrFlags::FPClamp; + } + + static bool hasIntClamp(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::IntClamp; + } + + uint64_t getClampMask(const MachineInstr &MI) const { + const uint64_t ClampFlags = SIInstrFlags::FPClamp | + SIInstrFlags::IntClamp | + SIInstrFlags::ClampLo | + SIInstrFlags::ClampHi; + return MI.getDesc().TSFlags & ClampFlags; } bool isVGPRCopy(const MachineInstr &MI) const { @@ -630,9 +694,7 @@ public: bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override; - static unsigned getVALUOp(const MachineInstr &MI); - - bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const; + unsigned getVALUOp(const MachineInstr &MI) const; /// \brief Return the correct register class for \p OpNo. For target-specific /// instructions, this will return the register class that has been defined @@ -774,6 +836,7 @@ public: unsigned isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; + unsigned getInstBundleSize(const MachineInstr &MI) const; unsigned getInstSizeInBytes(const MachineInstr &MI) const override; bool mayAccessFlatAddressSpace(const MachineInstr &MI) const; @@ -812,9 +875,17 @@ public: MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg) const; + + static bool isKillTerminator(unsigned Opcode); + const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const; + + static bool isLegalMUBUFImmOffset(unsigned Imm) { + return isUInt<12>(Imm); + } }; namespace AMDGPU { + LLVM_READONLY int getVOPe64(uint16_t Opcode); @@ -855,7 +926,8 @@ namespace AMDGPU { TF_LONG_BRANCH_FORWARD = 1 << 0, TF_LONG_BRANCH_BACKWARD = 1 << 1 }; -} // End namespace AMDGPU + +} // end namespace AMDGPU namespace SI { namespace KernelInputOffsets { @@ -873,9 +945,9 @@ enum Offsets { LOCAL_SIZE_Z = 32 }; -} // End namespace KernelInputOffsets -} // End namespace SI +} // end namespace KernelInputOffsets +} // end namespace SI -} // End namespace llvm +} // end namespace llvm -#endif +#endif // LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 088173680fa8..fc2d35d873aa 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -11,6 +11,9 @@ def isCI : Predicate<"Subtarget->getGeneration() " def isCIOnly : Predicate<"Subtarget->getGeneration() ==" "SISubtarget::SEA_ISLANDS">, AssemblerPredicate <"FeatureSeaIslands">; +def isVIOnly : Predicate<"Subtarget->getGeneration() ==" + "SISubtarget::VOLCANIC_ISLANDS">, + AssemblerPredicate <"FeatureVolcanicIslands">; def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; @@ -22,6 +25,7 @@ def SIEncodingFamily { int VI = 1; int SDWA = 2; int SDWA9 = 3; + int GFX9 = 4; } //===----------------------------------------------------------------------===// 
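A toy illustration of how the SIEncodingFamily indices above are consumed: getMCOpcodeGen (further down in this file) emits one MC-opcode column per encoding, and the new GFX9 value simply adds a fifth column. Hand-written table shape, not the TableGen-generated one; the -1 convention is an assumption here:

#include <array>
#include <cstdint>

enum EncodingFamily { SI = 0, VI = 1, SDWA = 2, SDWA9 = 3, GFX9 = 4 };

using OpcodeRow = std::array<int16_t, 5>;   // -1 standing in for "no encoding"

int16_t mapPseudoToMC(const OpcodeRow &Row, EncodingFamily Family) {
  return Row[Family];
}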
@@ -89,6 +93,53 @@ def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad, [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>; +def SDTBufferStore : SDTypeProfile<0, 6, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vindex + SDTCisVT<3, i32>, // offset + SDTCisVT<4, i1>, // glc + SDTCisVT<5, i1>]>; // slc + +def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, + [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>; +def SIbuffer_store_format : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT", SDTBufferStore, + [SDNPMemOperand, SDNPHasChain, SDNPMayStore]>; + +class SDBufferAtomic<string opcode> : SDNode <opcode, + SDTypeProfile<1, 5, + [SDTCisVT<0, i32>, // dst + SDTCisVT<1, i32>, // vdata + SDTCisVT<2, v4i32>, // rsrc + SDTCisVT<3, i32>, // vindex + SDTCisVT<4, i32>, // offset + SDTCisVT<5, i1>]>, // slc + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] +>; + +def SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">; +def SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">; +def SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">; +def SIbuffer_atomic_smin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SMIN">; +def SIbuffer_atomic_umin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMIN">; +def SIbuffer_atomic_smax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SMAX">; +def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">; +def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">; +def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">; +def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">; + +def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP", + SDTypeProfile<1, 6, + [SDTCisVT<0, i32>, // dst + SDTCisVT<1, i32>, // src + SDTCisVT<2, i32>, // cmp + SDTCisVT<3, v4i32>, // rsrc + SDTCisVT<4, i32>, // vindex + SDTCisVT<5, i32>, // offset + SDTCisVT<6, i1>]>, // slc + [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore] +>; + class SDSample<string opcode> : SDNode <opcode, SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>, SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> @@ -110,81 +161,113 @@ def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET", defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>; defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>; +def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>; +def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>; + //===----------------------------------------------------------------------===// -// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1 -// to be glued to the memory instructions. +// SDNodes PatFrags for loads/stores with a glue input. +// This is for SDNodes and PatFrag for local loads and stores to +// enable s_mov_b32 m0, -1 to be glued to the memory instructions. +// +// These mirror the regular load/store PatFrags and rely on special +// processing during Select() to add the glued copy. 
+// //===----------------------------------------------------------------------===// -def SIld_local : SDNode <"ISD::LOAD", SDTLoad, +def AMDGPUld_glue : SDNode <"ISD::LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] >; -def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{ - return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; +def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr), [{ + return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED; }]>; -def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ - return cast<LoadSDNode>(N)->getAddressingMode() == ISD::UNINDEXED && - cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; +def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr), [{ + return cast<LoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; }]>; -def si_load_local_align8 : Aligned8Bytes < - (ops node:$ptr), (si_load_local node:$ptr) ->; +def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr), [{ + return cast<LoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD; +}]>; -def si_sextload_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ +def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{ return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; }]>; -def si_az_extload_local : AZExtLoadBase <si_ld_local>; -multiclass SIExtLoadLocal <PatFrag ld_node> { +def zextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{ + return cast<LoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD; +}]>; - def _i8 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), - [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;}] - >; +def az_extload_glue : AZExtLoadBase <unindexedload_glue>; - def _i16 : PatFrag <(ops node:$ptr), (ld_node node:$ptr), - [{return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;}] - >; -} +def az_extloadi8_glue : PatFrag<(ops node:$ptr), (az_extload_glue node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; -defm si_sextload_local : SIExtLoadLocal <si_sextload_local>; -defm si_az_extload_local : SIExtLoadLocal <si_az_extload_local>; +def az_extloadi16_glue : PatFrag<(ops node:$ptr), (az_extload_glue node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +def sextloadi8_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; +}]>; -def SIst_local : SDNode <"ISD::STORE", SDTStore, +def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr), [{ + return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16; +}]>; + +def load_glue_align8 : Aligned8Bytes < + (ops node:$ptr), (load_glue node:$ptr) +>; + + +def load_local_m0 : LoadFrag<load_glue>, LocalAddress; +def sextloadi8_local_m0 : LoadFrag<sextloadi8_glue>, LocalAddress; +def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress; +def az_extloadi8_local_m0 : LoadFrag<az_extloadi8_glue>, LocalAddress; +def az_extloadi16_local_m0 : LoadFrag<az_extloadi16_glue>, LocalAddress; +def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress; + + +def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue] >; -def si_st_local : PatFrag < - (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS; +def unindexedstore_glue : PatFrag<(ops node:$val, node:$ptr), + 
(AMDGPUst_glue node:$val, node:$ptr), [{ + return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED; }]>; -def si_store_local : PatFrag < - (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED && - !cast<StoreSDNode>(N)->isTruncatingStore(); +def store_glue : PatFrag<(ops node:$val, node:$ptr), + (unindexedstore_glue node:$val, node:$ptr), [{ + return !cast<StoreSDNode>(N)->isTruncatingStore(); }]>; -def si_store_local_align8 : Aligned8Bytes < - (ops node:$val, node:$ptr), (si_store_local node:$val, node:$ptr) ->; - -def si_truncstore_local : PatFrag < - (ops node:$val, node:$ptr), (si_st_local node:$val, node:$ptr), [{ +def truncstore_glue : PatFrag<(ops node:$val, node:$ptr), + (unindexedstore_glue node:$val, node:$ptr), [{ return cast<StoreSDNode>(N)->isTruncatingStore(); }]>; -def si_truncstore_local_i8 : PatFrag < - (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ +def truncstorei8_glue : PatFrag<(ops node:$val, node:$ptr), + (truncstore_glue node:$val, node:$ptr), [{ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i8; }]>; -def si_truncstore_local_i16 : PatFrag < - (ops node:$val, node:$ptr), (si_truncstore_local node:$val, node:$ptr), [{ +def truncstorei16_glue : PatFrag<(ops node:$val, node:$ptr), + (truncstore_glue node:$val, node:$ptr), [{ return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16; }]>; +def store_glue_align8 : Aligned8Bytes < + (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr) +>; + +def store_local_m0 : StoreFrag<store_glue>, LocalAddress; +def truncstorei8_local_m0 : StoreFrag<truncstorei8_glue>, LocalAddress; +def truncstorei16_local_m0 : StoreFrag<truncstorei16_glue>, LocalAddress; + +def store_align8_local_m0 : StoreFrag<store_glue_align8>, LocalAddress; + def si_setcc_uniform : PatFrag < (ops node:$lhs, node:$rhs, node:$cond), (setcc node:$lhs, node:$rhs, node:$cond), [{ @@ -199,16 +282,6 @@ def si_setcc_uniform : PatFrag < return true; }]>; -def si_uniform_br : PatFrag < - (ops node:$cond, node:$bb), (brcond node:$cond, node:$bb), [{ - return isUniformBr(N); -}]>; - -def si_uniform_br_scc : PatFrag < - (ops node:$cond, node:$bb), (si_uniform_br node:$cond, node:$bb), [{ - return isCBranchSCC(N); -}]>; - def lshr_rev : PatFrag < (ops node:$src1, node:$src0), (srl $src0, $src1) @@ -231,27 +304,28 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> { [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] >; - def _local : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; + def _local_m0 : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>; } -defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; -defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; -defm si_atomic_inc : SIAtomicM0Glue2 <"INC", 1>; -defm si_atomic_dec : SIAtomicM0Glue2 <"DEC", 1>; -defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; -defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; -defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; -defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">; -defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; -defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; -defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; -defm si_atomic_swap : SIAtomicM0Glue2 <"SWAP">; +defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; +defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; +defm atomic_inc : SIAtomicM0Glue2 <"INC", 1>; +defm atomic_dec : SIAtomicM0Glue2 <"DEC", 1>; 
+defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; +defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; +defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; +defm atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">; +defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; +defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; +defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; +defm atomic_swap : SIAtomicM0Glue2 <"SWAP">; -def si_atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, +def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] >; -defm si_atomic_cmp_swap : AtomicCmpSwapLocal <si_atomic_cmp_swap_glue>; +def atomic_cmp_swap_local_m0 : AtomicCmpSwapLocal<atomic_cmp_swap_glue>; + def as_i1imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1); @@ -273,6 +347,10 @@ def as_i64imm: SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64); }]>; +def cond_as_i32imm: SDNodeXForm<cond, [{ + return CurDAG->getTargetConstant(N->get(), SDLoc(N), MVT::i32); +}]>; + // Copied from the AArch64 backend: def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{ return CurDAG->getTargetConstant( @@ -556,6 +634,7 @@ def gds : NamedOperandBit<"GDS", NamedMatchClass<"GDS">>; def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>; def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>; +def highmod : NamedOperandBit<"High", NamedMatchClass<"High">>; def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; def slc : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; @@ -659,6 +738,15 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> def Int32InputMods : IntInputMods<Int32InputModsMatchClass>; def Int64InputMods : IntInputMods<Int64InputModsMatchClass>; +class OpSelModsMatchClass : AsmOperandClass { + let Name = "OpSelMods"; + let ParserMethod = "parseRegOrImm"; + let PredicateMethod = "isRegOrImm"; +} + +def IntOpSelModsMatchClass : OpSelModsMatchClass; +def IntOpSelMods : InputMods<IntOpSelModsMatchClass>; + def FPRegSDWAInputModsMatchClass : AsmOperandClass { let Name = "SDWARegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; @@ -750,6 +838,16 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">; def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">; def VOP3PMods0 : ComplexPattern<untyped, 3, "SelectVOP3PMods0">; +def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">; +def VOP3OpSel0 : ComplexPattern<untyped, 3, "SelectVOP3OpSel0">; + +def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">; +def VOP3OpSelMods0 : ComplexPattern<untyped, 3, "SelectVOP3OpSelMods0">; + +def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">; + + +def Hi16Elt : ComplexPattern<untyped, 1, "SelectHi16Elt">; //===----------------------------------------------------------------------===// // SI assembler operands @@ -771,6 +869,7 @@ def SRCMODS { int NEG_HI = ABS; int OP_SEL_0 = 4; int OP_SEL_1 = 8; + int DST_OP_SEL = 8; } def DSTCLAMP { @@ -827,7 +926,7 @@ class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon< // Split EXP instruction into EXP and EXP_DONE so we can set // mayLoad for done=1. 
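Before the EXP changes that follow, a per-value note on the op_sel machinery introduced above (IntOpSelMods, the VOP3OpSel complex patterns, SRCMODS.DST_OP_SEL). The helper below is illustrative only and not how TableGen encodes it: each op_sel bit picks the high or low 16-bit half of a 32-bit register operand, and DST_OP_SEL does the same for where the 16-bit result lands.

#include <cstdint>

uint16_t pickHalf(uint32_t Reg, bool OpSelHigh) {
  return OpSelHigh ? uint16_t(Reg >> 16) : uint16_t(Reg & 0xffffu);
}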
multiclass EXP_m<bit done, SDPatternOperator node> { - let mayLoad = done in { + let mayLoad = done, DisableWQM = 1 in { let isPseudo = 1, isCodeGenOnly = 1 in { def "" : EXP_Helper<done, node>, SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>; @@ -943,7 +1042,7 @@ class getVOP3SrcForVT<ValueType VT> { VCSrc_f64, VCSrc_b64), !if(!eq(VT.Value, i1.Value), - SCSrc_b64, + SCSrc_i1, !if(isFP, !if(!eq(VT.Value, f16.Value), VCSrc_f16, @@ -1020,6 +1119,10 @@ class getSrcMod <ValueType VT> { ); } +class getOpSelMod <ValueType VT> { + Operand ret = !if(!eq(VT.Value, f16.Value), FP16InputMods, IntOpSelMods); +} + // Return type of input modifiers operand specified input operand for DPP class getSrcModExt <ValueType VT> { bit isFP = !if(!eq(VT.Value, f16.Value), 1, @@ -1048,7 +1151,7 @@ class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> { // Returns the input arguments for VOP3 instructions for the given SrcVT. class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, RegisterOperand Src2RC, int NumSrcArgs, - bit HasModifiers, bit HasOMod, + bit HasIntClamp, bit HasModifiers, bit HasOMod, Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> { dag ret = @@ -1063,7 +1166,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, clampmod:$clamp, omod:$omod) /* else */, // VOP1 without modifiers - (ins Src0RC:$src0) + !if (!eq(HasIntClamp, 1), + (ins Src0RC:$src0, clampmod:$clamp), + (ins Src0RC:$src0)) /* endif */ ), !if (!eq(NumSrcArgs, 2), !if (!eq(HasModifiers, 1), @@ -1077,7 +1182,10 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, clampmod:$clamp)) /* else */, // VOP2 without modifiers - (ins Src0RC:$src0, Src1RC:$src1) + !if (!eq(HasIntClamp, 1), + (ins Src0RC:$src0, Src1RC:$src1, clampmod:$clamp), + (ins Src0RC:$src0, Src1RC:$src1)) + /* endif */ ) /* NumSrcArgs == 3 */, !if (!eq(HasModifiers, 1), @@ -1093,7 +1201,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, clampmod:$clamp)) /* else */, // VOP3 without modifiers - (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2) + !if (!eq(HasIntClamp, 1), + (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2, clampmod:$clamp), + (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2)) /* endif */ )))); } @@ -1133,8 +1243,40 @@ class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC, ); } -class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, - bit HasModifiers, Operand Src0Mod, Operand Src1Mod> { +class getInsVOP3OpSel <RegisterOperand Src0RC, + RegisterOperand Src1RC, + RegisterOperand Src2RC, + int NumSrcArgs, + bit HasClamp, + Operand Src0Mod, + Operand Src1Mod, + Operand Src2Mod> { + dag ret = !if (!eq(NumSrcArgs, 2), + !if (HasClamp, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, + op_sel:$op_sel), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + op_sel:$op_sel)), + // else NumSrcArgs == 3 + !if (HasClamp, + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + clampmod:$clamp, + op_sel:$op_sel), + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + Src2Mod:$src2_modifiers, Src2RC:$src2, + op_sel:$op_sel)) + ); +} + +class getInsDPP <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC, + int NumSrcArgs, bit HasModifiers, + Operand Src0Mod, Operand Src1Mod> { dag ret = !if (!eq(NumSrcArgs, 0), // VOP1 without input operands 
(V_NOP) @@ -1143,26 +1285,29 @@ class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, !if (!eq(NumSrcArgs, 1), !if (!eq(HasModifiers, 1), // VOP1_DPP with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, - dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + (ins DstRC:$old, Src0Mod:$src0_modifiers, + Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) /* else */, // VOP1_DPP without modifiers - (ins Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, - bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) + (ins DstRC:$old, Src0RC:$src0, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) /* endif */) /* NumSrcArgs == 2 */, !if (!eq(HasModifiers, 1), // VOP2_DPP with modifiers - (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + (ins DstRC:$old, + Src0Mod:$src0_modifiers, Src0RC:$src0, Src1Mod:$src1_modifiers, Src1RC:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) /* else */, // VOP2_DPP without modifiers - (ins Src0RC:$src0, Src1RC:$src1, dpp_ctrl:$dpp_ctrl, - row_mask:$row_mask, bank_mask:$bank_mask, - bound_ctrl:$bound_ctrl) + (ins DstRC:$old, + Src0RC:$src0, Src1RC:$src1, dpp_ctrl:$dpp_ctrl, + row_mask:$row_mask, bank_mask:$bank_mask, + bound_ctrl:$bound_ctrl) /* endif */))); } @@ -1246,7 +1391,7 @@ class getAsm32 <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> { // Returns the assembly string for the inputs and outputs of a VOP3 // instruction. -class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, +class getAsm64 <bit HasDst, int NumSrcArgs, bit HasIntClamp, bit HasModifiers, bit HasOMod, ValueType DstVT = i32> { string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); @@ -1254,9 +1399,10 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, !if(!eq(NumSrcArgs, 2), " $src1_modifiers", " $src1_modifiers,")); string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); + string iclamp = !if(HasIntClamp, "$clamp", ""); string ret = !if(!eq(HasModifiers, 0), - getAsm32<HasDst, NumSrcArgs, DstVT>.ret, + getAsm32<HasDst, NumSrcArgs, DstVT>.ret # iclamp, dst#", "#src0#src1#src2#"$clamp"#!if(HasOMod, "$omod", "")); } @@ -1279,6 +1425,34 @@ class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasModifiers, string ret = dst#", "#src0#src1#src2#"$op_sel$op_sel_hi"#mods#clamp; } +class getAsmVOP3OpSel <int NumSrcArgs, + bit HasClamp, + bit Src0HasMods, + bit Src1HasMods, + bit Src2HasMods> { + string dst = " $vdst"; + + string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); + string isrc1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1", + " $src1,")); + string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", ""); + + string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string fsrc1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); + + string src0 = !if(Src0HasMods, fsrc0, isrc0); + string src1 = !if(Src1HasMods, fsrc1, isrc1); + string src2 = !if(Src2HasMods, fsrc2, isrc2); + + string clamp = !if(HasClamp, "$clamp", ""); + + string ret = dst#", "#src0#src1#src2#"$op_sel"#clamp; +} + class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> { string dst = !if(HasDst, !if(!eq(DstVT.Size, 1), @@ -1433,6 +1607,10 @@ class VOPProfile <list<ValueType> _ArgVT> { 
field bit HasClamp = HasModifiers; field bit HasSDWAClamp = EmitDst; field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret; + field bit HasIntClamp = !if(isFloatType<DstVT>.ret, 0, HasClamp); + field bit HasClampLo = HasClamp; + field bit HasClampHi = BitAnd<isPackedType<DstVT>.ret, HasClamp>.ret; + field bit HasHigh = 0; field bit IsPacked = isPackedType<Src0VT>.ret; field bit HasOpSel = IsPacked; @@ -1457,13 +1635,18 @@ class VOPProfile <list<ValueType> _ArgVT> { field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, - HasModifiers, HasOMod, Src0Mod, Src1Mod, + HasIntClamp, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; field dag InsVOP3P = getInsVOP3P<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, HasClamp, Src0PackedMod, Src1PackedMod, Src2PackedMod>.ret; - - field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs, + field dag InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64, + NumSrcArgs, + HasClamp, + getOpSelMod<Src0VT>.ret, + getOpSelMod<Src1VT>.ret, + getOpSelMod<Src2VT>.ret>.ret; + field dag InsDPP = getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP>.ret; field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, HasSDWAOMod, Src0ModSDWA, Src1ModSDWA, @@ -1471,8 +1654,13 @@ class VOPProfile <list<ValueType> _ArgVT> { field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret; - field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret; + field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasIntClamp, HasModifiers, HasOMod, DstVT>.ret; field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasModifiers, HasClamp, DstVT>.ret; + field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs, + HasClamp, + HasSrc0FloatMods, + HasSrc1FloatMods, + HasSrc2FloatMods>.ret; field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret; field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret; @@ -1495,6 +1683,8 @@ def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>; def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>; def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; +def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>; + def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>; def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>; def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>; @@ -1527,6 +1717,7 @@ def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>; +def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>; def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; @@ -1632,7 +1823,31 @@ def getBasicFromSDWAOp : InstrMapping { let ValueCols = [["Default"]]; } -def getMaskedMIMGOp : InstrMapping { +def getMaskedMIMGOp1 : InstrMapping { + let FilterClass = "MIMG_Mask"; + let RowFields = ["Op"]; + let ColFields = ["Channels"]; + let KeyCol = ["1"]; + let ValueCols = [["2"], ["3"], ["4"] ]; +} + +def getMaskedMIMGOp2 : InstrMapping { + let FilterClass = "MIMG_Mask"; + let RowFields = ["Op"]; + let ColFields = ["Channels"]; + let KeyCol = ["2"]; + let ValueCols 
= [["1"], ["3"], ["4"] ]; +} + +def getMaskedMIMGOp3 : InstrMapping { + let FilterClass = "MIMG_Mask"; + let RowFields = ["Op"]; + let ColFields = ["Channels"]; + let KeyCol = ["3"]; + let ValueCols = [["1"], ["2"], ["4"] ]; +} + +def getMaskedMIMGOp4 : InstrMapping { let FilterClass = "MIMG_Mask"; let RowFields = ["Op"]; let ColFields = ["Channels"]; @@ -1666,7 +1881,8 @@ def getMCOpcodeGen : InstrMapping { let ValueCols = [[!cast<string>(SIEncodingFamily.SI)], [!cast<string>(SIEncodingFamily.VI)], [!cast<string>(SIEncodingFamily.SDWA)], - [!cast<string>(SIEncodingFamily.SDWA9)]]; + [!cast<string>(SIEncodingFamily.SDWA9)], + [!cast<string>(SIEncodingFamily.GFX9)]]; } // Get equivalent SOPK instruction. @@ -1705,7 +1921,6 @@ def getAtomicNoRetOp : InstrMapping { } include "SIInstructions.td" -include "CIInstructions.td" include "DSInstructions.td" include "MIMGInstructions.td" diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index ba69e42d9125..9740a18b7248 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -11,13 +11,6 @@ // that are not yet supported remain commented out. //===----------------------------------------------------------------------===// -def isGCN : Predicate<"Subtarget->getGeneration() " - ">= SISubtarget::SOUTHERN_ISLANDS">, - AssemblerPredicate<"FeatureGCN">; -def isSI : Predicate<"Subtarget->getGeneration() " - "== SISubtarget::SOUTHERN_ISLANDS">, - AssemblerPredicate<"FeatureSouthernIslands">; - def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, @@ -25,14 +18,17 @@ def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, def HasMovrel : Predicate<"Subtarget->hasMovrel()">, AssemblerPredicate<"FeatureMovrel">; +class GCNPat<dag pattern, dag result> : AMDGPUPat<pattern, result> { + let SubtargetPredicate = isGCN; +} + + include "VOPInstructions.td" include "SOPInstructions.td" include "SMInstructions.td" include "FLATInstructions.td" include "BUFInstructions.td" -let SubtargetPredicate = isGCN in { - //===----------------------------------------------------------------------===// // EXP Instructions //===----------------------------------------------------------------------===// @@ -99,6 +95,7 @@ def ATOMIC_FENCE : SPseudoInstSI< [(atomic_fence (i32 imm:$ordering), (i32 imm:$scope))], "ATOMIC_FENCE $ordering, $scope"> { let hasSideEffects = 1; + let maybeAtomic = 1; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { @@ -111,12 +108,67 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), let usesCustomInserter = 1; } -// 64-bit vector move instruction. This is mainly used by the SIFoldOperands -// pass to enable folding of inline immediates. +// 64-bit vector move instruction. This is mainly used by the +// SIFoldOperands pass to enable folding of inline immediates. def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_b64:$src0)>; + +// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the +// WQM pass processes it. +def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; + +// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so +// that the @earlyclobber is respected. The @earlyclobber is to make sure that +// the instruction that defines $src0 (which is run in WWM) doesn't +// accidentally clobber inactive channels of $vdst. 
+let Constraints = "@earlyclobber $vdst" in { +def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; +} + } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] +def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> { + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; +} + +// Invert the exec mask and overwrite the inactive lanes of dst with inactive, +// restoring it after we're done. +def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst), + (ins VGPR_32: $src, VSrc_b32:$inactive), + [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> { + let Constraints = "$src = $vdst"; +} + +def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst), + (ins VReg_64: $src, VSrc_b64:$inactive), + [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> { + let Constraints = "$src = $vdst"; +} + + +let usesCustomInserter = 1, Defs = [SCC] in { +def S_ADD_U64_PSEUDO : SPseudoInstSI < + (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1), + [(set SReg_64:$vdst, (add i64:$src0, i64:$src1))] +>; + +def S_SUB_U64_PSEUDO : SPseudoInstSI < + (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1), + [(set SReg_64:$vdst, (sub i64:$src0, i64:$src1))] +>; + +def S_ADD_U64_CO_PSEUDO : SPseudoInstSI < + (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) +>; + +def S_SUB_U64_CO_PSEUDO : SPseudoInstSI < + (outs SReg_64:$vdst, VOPDstS64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) +>; + +} // End usesCustomInserter = 1, Defs = [SCC] + let usesCustomInserter = 1, SALU = 1 in { def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins), [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>; @@ -174,12 +226,14 @@ def SI_MASK_BRANCH : VPseudoInstSI < let isTerminator = 1 in { +let OtherPredicates = [EnableLateCFGStructurize] in { def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI < (outs), (ins SReg_64:$vcc, brtarget:$target), [(brcond i1:$vcc, bb:$target)]> { let Size = 12; } +} def SI_IF: CFPseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), @@ -243,18 +297,21 @@ def SI_ELSE_BREAK : CFPseudoInstSI < } let Uses = [EXEC], Defs = [EXEC,VCC] in { -def SI_KILL : PseudoInstSI < - (outs), (ins VSrc_b32:$src), - [(AMDGPUkill i32:$src)]> { - let isConvergent = 1; - let usesCustomInserter = 1; -} -def SI_KILL_TERMINATOR : SPseudoInstSI < - (outs), (ins VSrc_b32:$src)> { - let isTerminator = 1; +multiclass PseudoInstKill <dag ins> { + def _PSEUDO : PseudoInstSI <(outs), ins> { + let isConvergent = 1; + let usesCustomInserter = 1; + } + + def _TERMINATOR : SPseudoInstSI <(outs), ins> { + let isTerminator = 1; + } } +defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>; +defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>; + def SI_ILLEGAL_COPY : SPseudoInstSI < (outs unknown:$dst), (ins unknown:$src), [], " ; illegal copy $src to $dst">; @@ -316,6 +373,82 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI < let DisableWQM = 1; } +// Return for returning function calls. +def SI_RETURN : SPseudoInstSI < + (outs), (ins), [], + "; return"> { + let isTerminator = 1; + let isBarrier = 1; + let isReturn = 1; + let SchedRW = [WriteBranch]; +} + +// Return for returning function calls without output register. +// +// This version is only needed so we can fill in the output regiter in +// the custom inserter. 
+def SI_CALL_ISEL : SPseudoInstSI < + (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)]> { + let Size = 4; + let isCall = 1; + let SchedRW = [WriteBranch]; + let usesCustomInserter = 1; +} + +// Wrapper around s_swappc_b64 with extra $callee parameter to track +// the called function after regalloc. +def SI_CALL : SPseudoInstSI < + (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> { + let Size = 4; + let isCall = 1; + let UseNamedOperandTable = 1; + let SchedRW = [WriteBranch]; +} + +// Tail call handling pseudo +def SI_TCRETURN_ISEL : SPseudoInstSI<(outs), + (ins SSrc_b64:$src0, i32imm:$fpdiff), + [(AMDGPUtc_return i64:$src0, i32:$fpdiff)]> { + let isCall = 1; + let isTerminator = 1; + let isReturn = 1; + let isBarrier = 1; + let SchedRW = [WriteBranch]; + let usesCustomInserter = 1; +} + +def SI_TCRETURN : SPseudoInstSI < + (outs), + (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff)> { + let Size = 4; + let isCall = 1; + let isTerminator = 1; + let isReturn = 1; + let isBarrier = 1; + let UseNamedOperandTable = 1; + let SchedRW = [WriteBranch]; +} + + +def ADJCALLSTACKUP : SPseudoInstSI< + (outs), (ins i32imm:$amt0, i32imm:$amt1), + [(callseq_start timm:$amt0, timm:$amt1)], + "; adjcallstackup $amt0 $amt1"> { + let Size = 8; // Worst case. (s_add_u32 + constant) + let FixedSize = 1; + let hasSideEffects = 1; + let usesCustomInserter = 1; +} + +def ADJCALLSTACKDOWN : SPseudoInstSI< + (outs), (ins i32imm:$amt1, i32imm:$amt2), + [(callseq_end timm:$amt1, timm:$amt2)], + "; adjcallstackdown $amt1"> { + let Size = 8; // Worst case. (s_add_u32 + constant) + let hasSideEffects = 1; + let usesCustomInserter = 1; +} + let Defs = [M0, EXEC], UseNamedOperandTable = 1 in { @@ -416,39 +549,63 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < let Defs = [SCC]; } -} // End SubtargetPredicate = isGCN - -let Predicates = [isGCN] in { -def : Pat < +def : GCNPat < (AMDGPUinit_exec i64:$src), (SI_INIT_EXEC (as_i64imm $src)) >; -def : Pat < +def : GCNPat < (AMDGPUinit_exec_from_input i32:$input, i32:$shift), (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift)) >; -def : Pat< +def : GCNPat< (AMDGPUtrap timm:$trapid), (S_TRAP $trapid) >; -def : Pat< +def : GCNPat< (AMDGPUelse i64:$src, bb:$target), (SI_ELSE $src, $target, 0) >; -def : Pat < +def : GCNPat < (int_AMDGPU_kilp), - (SI_KILL (i32 0xbf800000)) + (SI_KILL_I1_PSEUDO (i1 0), 0) +>; + +def : Pat < + // -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0) + (AMDGPUkill (i32 -1082130432)), + (SI_KILL_I1_PSEUDO (i1 0), 0) +>; + +def : Pat < + (int_amdgcn_kill i1:$src), + (SI_KILL_I1_PSEUDO $src, 0) +>; + +def : Pat < + (int_amdgcn_kill (i1 (not i1:$src))), + (SI_KILL_I1_PSEUDO $src, -1) +>; + +def : Pat < + (AMDGPUkill i32:$src), + (SI_KILL_F32_COND_IMM_PSEUDO $src, 0, 3) // 3 means SETOGE +>; + +def : Pat < + (int_amdgcn_kill (i1 (setcc f32:$src, InlineFPImm<f32>:$imm, cond:$cond))), + (SI_KILL_F32_COND_IMM_PSEUDO $src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond)) >; +// TODO: we could add more variants for other types of conditionals //===----------------------------------------------------------------------===// // VOP1 Patterns //===----------------------------------------------------------------------===// -let Predicates = [UnsafeFPMath] in { +let SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath] in { //def : RcpPat<V_RCP_F64_e32, f64>; //defm : RsqPat<V_RSQ_F64_e32, f64>; @@ -458,70 +615,70 @@ def : RsqPat<V_RSQ_F32_e32, f32>; def : RsqPat<V_RSQ_F64_e32, f64>; // Convert (x - floor(x)) to 
fract(x) -def : Pat < +def : GCNPat < (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) >; // Convert (x + (-floor(x))) to fract(x) -def : Pat < +def : GCNPat < (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) >; -} // End Predicates = [UnsafeFPMath] +} // End SubtargetPredicate = isGCN, OtherPredicates = [UnsafeFPMath] // f16_to_fp patterns -def : Pat < +def : GCNPat < (f32 (f16_to_fp i32:$src0)), (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; -def : Pat < +def : GCNPat < (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))), (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; -def : Pat < +def : GCNPat < (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))), (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; -def : Pat < +def : GCNPat < (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))), (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; -def : Pat < +def : GCNPat < (f64 (fpextend f16:$src)), (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src)) >; // fp_to_fp16 patterns -def : Pat < +def : GCNPat < (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, DSTCLAMP.NONE, DSTOMOD.NONE) >; -def : Pat < +def : GCNPat < (i32 (fp_to_sint f16:$src)), (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src)) >; -def : Pat < +def : GCNPat < (i32 (fp_to_uint f16:$src)), (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src)) >; -def : Pat < +def : GCNPat < (f16 (sint_to_fp i32:$src)), (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src)) >; -def : Pat < +def : GCNPat < (f16 (uint_to_fp i32:$src)), (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src)) >; @@ -531,7 +688,7 @@ def : Pat < //===----------------------------------------------------------------------===// multiclass FMADPat <ValueType vt, Instruction inst> { - def : Pat < + def : GCNPat < (vt (fmad (VOP3NoMods vt:$src0), (VOP3NoMods vt:$src1), (VOP3NoMods vt:$src2))), @@ -543,7 +700,7 @@ multiclass FMADPat <ValueType vt, Instruction inst> { defm : FMADPat <f16, V_MAC_F16_e64>; defm : FMADPat <f32, V_MAC_F32_e64>; -class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : Pat< +class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : GCNPat< (f32 (mad_opr (VOP3Mods f32:$src0, i32:$src0_mod), (VOP3Mods f32:$src1, i32:$src1_mod), (VOP3Mods f32:$src2, i32:$src2_mod))), @@ -554,7 +711,7 @@ class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : Pat< def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz>; multiclass SelectPat <ValueType vt, Instruction inst> { - def : Pat < + def : GCNPat < (vt (select i1:$src0, vt:$src1, vt:$src2)), (inst $src2, $src1, $src0) >; @@ -565,7 +722,7 @@ defm : SelectPat <i32, V_CNDMASK_B32_e64>; defm : SelectPat <f16, V_CNDMASK_B32_e64>; defm : SelectPat <f32, V_CNDMASK_B32_e64>; -def : Pat < +def : GCNPat < (i32 (add (i32 (ctpop i32:$popcnt)), i32:$val)), (V_BCNT_U32_B32_e64 $popcnt, $val) >; @@ -638,6 +795,8 @@ foreach Index = 0-15 in { >; } +let SubtargetPredicate = isGCN in { + // FIXME: Why do only some of these type combinations for SReg and // VReg? 
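The f16_to_fp patterns above fold plain integer bit operations into source modifiers: for an IEEE half stored in the low 16 bits of an i32, and with 0x7fff is abs, xor with 0x8000 is negation, and or with 0x8000 is negated abs. A minimal check of those identities on the raw bit patterns (1.0 in binary16 is 0x3c00):

    #include <cassert>
    #include <cstdint>

    // IEEE binary16: bit 15 is the sign, so ABS/NEG/NEG_ABS are pure bit ops.
    uint16_t h_abs(uint16_t h)     { return h & 0x7fff; } // SRCMODS.ABS
    uint16_t h_neg(uint16_t h)     { return h ^ 0x8000; } // SRCMODS.NEG
    uint16_t h_neg_abs(uint16_t h) { return h | 0x8000; } // SRCMODS.NEG_ABS

    int main() {
      const uint16_t kOneH = 0x3c00;      // 1.0 in binary16
      const uint16_t kMinusOneH = 0xbc00; // -1.0 in binary16
      assert(h_abs(kMinusOneH) == kOneH);
      assert(h_neg(kOneH) == kMinusOneH);
      assert(h_neg_abs(kOneH) == kMinusOneH);
      assert(h_neg_abs(kMinusOneH) == kMinusOneH);
      return 0;
    }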
// 16-bit bitcast @@ -698,6 +857,8 @@ def : BitConvert <v8f32, v8i32, VReg_256>; def : BitConvert <v16i32, v16f32, VReg_512>; def : BitConvert <v16f32, v16i32, VReg_512>; +} // End SubtargetPredicate = isGCN + /********** =================== **********/ /********** Src & Dst modifiers **********/ /********** =================== **********/ @@ -705,7 +866,7 @@ def : BitConvert <v16f32, v16i32, VReg_512>; // If denormals are not enabled, it only impacts the compare of the // inputs. The output result is not flushed. -class ClampPat<Instruction inst, ValueType vt> : Pat < +class ClampPat<Instruction inst, ValueType vt> : GCNPat < (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))), (inst i32:$src0_modifiers, vt:$src0, i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE) @@ -715,19 +876,25 @@ def : ClampPat<V_MAX_F32_e64, f32>; def : ClampPat<V_MAX_F64, f64>; def : ClampPat<V_MAX_F16_e64, f16>; +def : GCNPat < + (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))), + (V_PK_MAX_F16 $src0_modifiers, $src0, + $src0_modifiers, $src0, DSTCLAMP.ENABLE) +>; + /********** ================================ **********/ /********** Floating point absolute/negative **********/ /********** ================================ **********/ // Prevent expanding both fneg and fabs. -def : Pat < +def : GCNPat < (fneg (fabs f32:$src)), (S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit >; // FIXME: Should use S_OR_B32 -def : Pat < +def : GCNPat < (fneg (fabs f64:$src)), (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), @@ -737,17 +904,17 @@ def : Pat < sub1) >; -def : Pat < +def : GCNPat < (fabs f32:$src), (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff))) >; -def : Pat < +def : GCNPat < (fneg f32:$src), (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000))) >; -def : Pat < +def : GCNPat < (fabs f64:$src), (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), @@ -757,7 +924,7 @@ def : Pat < sub1) >; -def : Pat < +def : GCNPat < (fneg f64:$src), (REG_SEQUENCE VReg_64, (i32 (EXTRACT_SUBREG f64:$src, sub0)), @@ -767,18 +934,18 @@ def : Pat < sub1) >; -def : Pat < +def : GCNPat < (fcopysign f16:$src0, f16:$src1), (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) >; -def : Pat < +def : GCNPat < (fcopysign f32:$src0, f16:$src1), (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), $src0, (V_LSHLREV_B32_e64 (i32 16), $src1)) >; -def : Pat < +def : GCNPat < (fcopysign f64:$src0, f16:$src1), (REG_SEQUENCE SReg_64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, @@ -786,39 +953,39 @@ def : Pat < (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1) >; -def : Pat < +def : GCNPat < (fcopysign f16:$src0, f32:$src1), (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, (V_LSHRREV_B32_e64 (i32 16), $src1)) >; -def : Pat < +def : GCNPat < (fcopysign f16:$src0, f64:$src1), (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) >; -def : Pat < +def : GCNPat < (fneg f16:$src), (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000))) >; -def : Pat < +def : GCNPat < (fabs f16:$src), (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x00007fff))) >; -def : Pat < +def : GCNPat < (fneg (fabs f16:$src)), (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit >; -def : Pat < +def : GCNPat < (fneg v2f16:$src), (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src) >; -def : Pat < +def : GCNPat < (fabs v2f16:$src), (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src) >; @@ -827,7 +994,7 @@ def : Pat < // // fabs is not reported as free because there is modifier for 
it in // VOP3P instructions, so it is turned into the bit op. -def : Pat < +def : GCNPat < (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))), (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit >; @@ -836,17 +1003,17 @@ def : Pat < /********** Immediate Patterns **********/ /********** ================== **********/ -def : Pat < +def : GCNPat < (VGPRImm<(i32 imm)>:$imm), (V_MOV_B32_e32 imm:$imm) >; -def : Pat < +def : GCNPat < (VGPRImm<(f32 fpimm)>:$imm), (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm))) >; -def : Pat < +def : GCNPat < (i32 imm:$imm), (S_MOV_B32 imm:$imm) >; @@ -854,27 +1021,27 @@ def : Pat < // FIXME: Workaround for ordering issue with peephole optimizer where // a register class copy interferes with immediate folding. Should // use s_mov_b32, which can be shrunk to s_movk_i32 -def : Pat < +def : GCNPat < (VGPRImm<(f16 fpimm)>:$imm), (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm))) >; -def : Pat < +def : GCNPat < (f32 fpimm:$imm), (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) >; -def : Pat < +def : GCNPat < (f16 fpimm:$imm), (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm))) >; -def : Pat < +def : GCNPat < (i32 frameindex:$fi), (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi))) >; -def : Pat < +def : GCNPat < (i64 InlineImm<i64>:$imm), (S_MOV_B64 InlineImm<i64>:$imm) >; @@ -882,12 +1049,12 @@ def : Pat < // XXX - Should this use a s_cmp to set SCC? // Set to sign-extended 64-bit value (true = -1, false = 0) -def : Pat < +def : GCNPat < (i1 imm:$imm), (S_MOV_B64 (i64 (as_i64imm $imm))) >; -def : Pat < +def : GCNPat < (f64 InlineFPImm<f64>:$imm), (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineFPImm<f64>:$imm))) >; @@ -896,14 +1063,16 @@ def : Pat < /********** Intrinsic Patterns **********/ /********** ================== **********/ +let SubtargetPredicate = isGCN in { def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>; +} -def : Pat < +def : GCNPat < (i32 (sext i1:$src0)), (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0) >; -class Ext32Pat <SDNode ext> : Pat < +class Ext32Pat <SDNode ext> : GCNPat < (i32 (ext i1:$src0)), (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src0) >; @@ -912,7 +1081,7 @@ def : Ext32Pat <zext>; def : Ext32Pat <anyext>; // The multiplication scales from [0,1] to the unsigned integer range -def : Pat < +def : GCNPat < (AMDGPUurecip i32:$src0), (V_CVT_U32_F32_e32 (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1), @@ -923,17 +1092,21 @@ def : Pat < // VOP3 Patterns //===----------------------------------------------------------------------===// -def : IMad24Pat<V_MAD_I32_I24>; -def : UMad24Pat<V_MAD_U32_U24>; +let SubtargetPredicate = isGCN in { + +def : IMad24Pat<V_MAD_I32_I24, 1>; +def : UMad24Pat<V_MAD_U32_U24, 1>; defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; def : ROTRPattern <V_ALIGNBIT_B32>; -def : Pat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), +} + +def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; -def : Pat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), +def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; @@ -943,13 +1116,13 @@ def : Pat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { // Extract with offset - def : Pat< + def : GCNPat< (eltvt 
(extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))), (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset) >; // Insert with offset - def : Pat< + def : GCNPat< (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))), (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val) >; @@ -969,70 +1142,70 @@ defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">; // SAD Patterns //===----------------------------------------------------------------------===// -def : Pat < +def : GCNPat < (add (sub_oneuse (umax i32:$src0, i32:$src1), (umin i32:$src0, i32:$src1)), i32:$src2), - (V_SAD_U32 $src0, $src1, $src2) + (V_SAD_U32 $src0, $src1, $src2, (i1 0)) >; -def : Pat < +def : GCNPat < (add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)), (sub i32:$src0, i32:$src1), (sub i32:$src1, i32:$src0)), i32:$src2), - (V_SAD_U32 $src0, $src1, $src2) + (V_SAD_U32 $src0, $src1, $src2, (i1 0)) >; //===----------------------------------------------------------------------===// // Conversion Patterns //===----------------------------------------------------------------------===// -def : Pat<(i32 (sext_inreg i32:$src, i1)), +def : GCNPat<(i32 (sext_inreg i32:$src, i1)), (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16 // Handle sext_inreg in i64 -def : Pat < +def : GCNPat < (i64 (sext_inreg i64:$src, i1)), (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16 >; -def : Pat < +def : GCNPat < (i16 (sext_inreg i16:$src, i1)), (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16 >; -def : Pat < +def : GCNPat < (i16 (sext_inreg i16:$src, i8)), (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16 >; -def : Pat < +def : GCNPat < (i64 (sext_inreg i64:$src, i8)), (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16 >; -def : Pat < +def : GCNPat < (i64 (sext_inreg i64:$src, i16)), (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16 >; -def : Pat < +def : GCNPat < (i64 (sext_inreg i64:$src, i32)), (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16 >; -def : Pat < +def : GCNPat < (i64 (zext i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) >; -def : Pat < +def : GCNPat < (i64 (anyext i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) >; -class ZExt_i64_i1_Pat <SDNode ext> : Pat < +class ZExt_i64_i1_Pat <SDNode ext> : GCNPat < (i64 (ext i1:$src)), (REG_SEQUENCE VReg_64, (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0, @@ -1045,20 +1218,20 @@ def : ZExt_i64_i1_Pat<anyext>; // FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that // REG_SEQUENCE patterns don't support instructions with multiple outputs. -def : Pat < +def : GCNPat < (i64 (sext i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1) >; -def : Pat < +def : GCNPat < (i64 (sext i1:$src)), (REG_SEQUENCE VReg_64, (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0, (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1) >; -class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat < +class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : GCNPat < (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE)) >; @@ -1074,37 +1247,37 @@ def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>; // 64-bit comparisons. 
When legalizing SGPR copies, instructions // resulting in the copies from SCC to these instructions will be // moved to the VALU. -def : Pat < +def : GCNPat < (i1 (and i1:$src0, i1:$src1)), (S_AND_B64 $src0, $src1) >; -def : Pat < +def : GCNPat < (i1 (or i1:$src0, i1:$src1)), (S_OR_B64 $src0, $src1) >; -def : Pat < +def : GCNPat < (i1 (xor i1:$src0, i1:$src1)), (S_XOR_B64 $src0, $src1) >; -def : Pat < +def : GCNPat < (f32 (sint_to_fp i1:$src)), (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src) >; -def : Pat < +def : GCNPat < (f32 (uint_to_fp i1:$src)), (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src) >; -def : Pat < +def : GCNPat < (f64 (sint_to_fp i1:$src)), (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)) >; -def : Pat < +def : GCNPat < (f64 (uint_to_fp i1:$src)), (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)) >; @@ -1112,79 +1285,95 @@ def : Pat < //===----------------------------------------------------------------------===// // Miscellaneous Patterns //===----------------------------------------------------------------------===// -def : Pat < +def : GCNPat < (i32 (AMDGPUfp16_zext f16:$src)), (COPY $src) >; -def : Pat < +def : GCNPat < (i32 (trunc i64:$a)), (EXTRACT_SUBREG $a, sub0) >; -def : Pat < +def : GCNPat < (i1 (trunc i32:$a)), (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) >; -def : Pat < +def : GCNPat < (i1 (trunc i16:$a)), (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) >; -def : Pat < +def : GCNPat < (i1 (trunc i64:$a)), (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) >; -def : Pat < +def : GCNPat < (i32 (bswap i32:$a)), (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)), (V_ALIGNBIT_B32 $a, $a, (i32 24)), (V_ALIGNBIT_B32 $a, $a, (i32 8))) >; -multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { - def : Pat < - (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), - (BFM $a, $b) - >; +let OtherPredicates = [NoFP16Denormals] in { +def : GCNPat< + (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), + (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0) +>; - def : Pat < - (vt (add (vt (shl 1, vt:$a)), -1)), - (BFM $a, (MOV (i32 0))) - >; +def : GCNPat< + (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), + (V_PK_MUL_F16 0, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) +>; } -defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; -// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>; -defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>; - -def : Pat< +let OtherPredicates = [FP16Denormals] in { +def : GCNPat< (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), - (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0) + (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0) >; -def : Pat< +def : GCNPat< + (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), + (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE) +>; +} + +let OtherPredicates = [NoFP32Denormals] in { +def : GCNPat< (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0) >; +} + +let OtherPredicates = [FP32Denormals] in { +def : GCNPat< + (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), + (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src, 0, 0) +>; +} -def : Pat< +let OtherPredicates = [NoFP64Denormals] in { +def : GCNPat< (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0) >; +} -def : Pat< - (fcanonicalize (v2f16 (VOP3PMods 
v2f16:$src, i32:$src_mods))), - (V_PK_MUL_F16 SRCMODS.OP_SEL_1, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) +let OtherPredicates = [FP64Denormals] in { +def : GCNPat< + (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), + (V_MAX_F64 $src_mods, $src, $src_mods, $src, 0, 0) >; +} // Allow integer inputs -class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat< +class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat< (node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3, (i1 timm:$compr), (i1 timm:$vm)), (Inst i8:$tgt, vt:$src0, vt:$src1, vt:$src2, vt:$src3, i1:$vm, i1:$compr, i8:$en) >; @@ -1192,36 +1381,43 @@ class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat< def : ExpPattern<AMDGPUexport, i32, EXP>; def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>; -def : Pat < +def : GCNPat < (v2i16 (build_vector i16:$src0, i16:$src1)), (v2i16 (S_PACK_LL_B32_B16 $src0, $src1)) >; +// COPY_TO_REGCLASS is workaround tablegen bug from multiple outputs +// from S_LSHL_B32's multiple outputs from implicit scc def. +def : GCNPat < + (v2i16 (build_vector (i16 0), i16:$src1)), + (v2i16 (COPY_TO_REGCLASS (S_LSHL_B32 i16:$src1, (i16 16)), SReg_32_XM0)) +>; + // With multiple uses of the shift, this will duplicate the shift and // increase register pressure. -def : Pat < +def : GCNPat < (v2i16 (build_vector i16:$src0, (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))), (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1)) >; -def : Pat < +def : GCNPat < (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))), (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))), (v2i16 (S_PACK_HH_B32_B16 $src0, $src1)) >; // TODO: Should source modifiers be matched to v_pack_b32_f16? -def : Pat < +def : GCNPat < (v2f16 (build_vector f16:$src0, f16:$src1)), (v2f16 (S_PACK_LL_B32_B16 $src0, $src1)) >; -// def : Pat < +// def : GCNPat < // (v2f16 (scalar_to_vector f16:$src0)), // (COPY $src0) // >; -// def : Pat < +// def : GCNPat < // (v2i16 (scalar_to_vector i16:$src0)), // (COPY $src0) // >; @@ -1230,7 +1426,7 @@ def : Pat < // Fract Patterns //===----------------------------------------------------------------------===// -let Predicates = [isSI] in { +let SubtargetPredicate = isSI in { // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is // used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient @@ -1239,7 +1435,7 @@ let Predicates = [isSI] in { // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) // Convert floor(x) to (x - fract(x)) -def : Pat < +def : GCNPat < (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), (V_ADD_F64 $mods, @@ -1257,7 +1453,7 @@ def : Pat < DSTCLAMP.NONE, DSTOMOD.NONE) >; -} // End Predicates = [isSI] +} // End SubtargetPredicates = isSI //============================================================================// // Miscellaneous Optimization Patterns @@ -1266,20 +1462,41 @@ def : Pat < // Undo sub x, c -> add x, -c canonicalization since c is more likely // an inline immediate than -c. // TODO: Also do for 64-bit. 
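On the v2i16 / v2f16 build_vector patterns above: S_PACK_LL_B32_B16 forms lo | (hi << 16), and the new (0, hi) special case is just a 16-bit left shift, which is why it maps to a single S_LSHL_B32. A small sketch of that packing (a model of what the patterns compute, not a spec of the instructions):

    #include <cassert>
    #include <cstdint>

    // v2i16 build_vector (lo, hi) viewed as one 32-bit register value.
    uint32_t packLL(uint16_t lo, uint16_t hi) {
      return uint32_t(lo) | (uint32_t(hi) << 16); // S_PACK_LL_B32_B16 shape
    }

    int main() {
      assert(packLL(0x1234, 0xabcd) == 0xabcd1234u);
      // build_vector (0, hi) is just hi << 16, i.e. a single S_LSHL_B32 by 16.
      assert(packLL(0, 0xabcd) == (uint32_t(0xabcd) << 16));
      return 0;
    }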
-def : Pat< +def : GCNPat< (add i32:$src0, (i32 NegSubInlineConst32:$src1)), (S_SUB_I32 $src0, NegSubInlineConst32:$src1) >; + +multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { + def : GCNPat < + (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), + (BFM $a, $b) + >; + + def : GCNPat < + (vt (add (vt (shl 1, vt:$a)), -1)), + (BFM $a, (MOV (i32 0))) + >; +} + +let SubtargetPredicate = isGCN in { + +defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; +// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>; + +defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>; def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>; def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>; def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>; +} + // This matches 16 permutations of // max(min(x, y), min(max(x, y), z)) class FPMed3Pat<ValueType vt, - Instruction med3Inst> : Pat< + Instruction med3Inst> : GCNPat< (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), @@ -1288,20 +1505,30 @@ class FPMed3Pat<ValueType vt, (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) >; -def : FPMed3Pat<f32, V_MED3_F32>; - -let Predicates = [isGFX9] in { -def : FPMed3Pat<f16, V_MED3_F16>; -def : IntMed3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>; -def : IntMed3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>; -} // End Predicates = [isGFX9] +class FP16Med3Pat<ValueType vt, + Instruction med3Inst> : GCNPat< + (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), + (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), + (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), + (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))), + (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE) +>; -//============================================================================// -// Assembler aliases -//============================================================================// +class Int16Med3Pat<Instruction med3Inst, + SDPatternOperator max, + SDPatternOperator max_oneuse, + SDPatternOperator min_oneuse, + ValueType vt = i32> : GCNPat< + (max (min_oneuse vt:$src0, vt:$src1), + (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)), + (med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE) +>; -def : MnemonicAlias<"v_add_u32", "v_add_i32">; -def : MnemonicAlias<"v_sub_u32", "v_sub_i32">; -def : MnemonicAlias<"v_subrev_u32", "v_subrev_i32">; +def : FPMed3Pat<f32, V_MED3_F32>; -} // End isGCN predicate +let OtherPredicates = [isGFX9] in { +def : FP16Med3Pat<f16, V_MED3_F16>; +def : Int16Med3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>; +def : Int16Med3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>; +} // End Predicates = [isGFX9] diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index c6ad61a325cc..84cd47a101a8 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -1,4 +1,4 @@ -//===-- SILoadStoreOptimizer.cpp ------------------------------------------===// +//===- SILoadStoreOptimizer.cpp -------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -14,6 +14,12 @@ // ==> // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8 // +// The same is done for certain SMEM 
and VMEM opcodes, e.g.: +// s_buffer_load_dword s4, s[0:3], 4 +// s_buffer_load_dword s5, s[0:3], 8 +// ==> +// s_buffer_load_dwordx2 s[4:5], s[0:3], 4 +// // // Future improvements: // @@ -56,8 +62,9 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" +#include <algorithm> #include <cassert> +#include <cstdlib> #include <iterator> #include <utility> @@ -68,31 +75,56 @@ using namespace llvm; namespace { class SILoadStoreOptimizer : public MachineFunctionPass { - - typedef struct { + enum InstClassEnum { + DS_READ_WRITE, + S_BUFFER_LOAD_IMM, + BUFFER_LOAD_OFFEN, + BUFFER_LOAD_OFFSET, + BUFFER_STORE_OFFEN, + BUFFER_STORE_OFFSET, + }; + + struct CombineInfo { MachineBasicBlock::iterator I; MachineBasicBlock::iterator Paired; unsigned EltSize; unsigned Offset0; unsigned Offset1; unsigned BaseOff; + InstClassEnum InstClass; + bool GLC0; + bool GLC1; + bool SLC0; + bool SLC1; bool UseST64; + bool IsX2; SmallVector<MachineInstr*, 8> InstsToMove; - } CombineInfo; + }; private: + const SISubtarget *STM = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; MachineRegisterInfo *MRI = nullptr; AliasAnalysis *AA = nullptr; + unsigned CreatedX2; static bool offsetsCanBeCombined(CombineInfo &CI); - bool findMatchingDSInst(CombineInfo &CI); + bool findMatchingInst(CombineInfo &CI); + unsigned read2Opcode(unsigned EltSize) const; + unsigned read2ST64Opcode(unsigned EltSize) const; MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI); + unsigned write2Opcode(unsigned EltSize) const; + unsigned write2ST64Opcode(unsigned EltSize) const; MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI); + MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI); + MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI); + unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2, + bool &IsOffen) const; + MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI); public: static char ID; @@ -141,36 +173,35 @@ static void moveInstsAfter(MachineBasicBlock::iterator I, } } -static void addDefsToList(const MachineInstr &MI, - SmallVectorImpl<const MachineOperand *> &Defs) { - for (const MachineOperand &Def : MI.defs()) { - Defs.push_back(&Def); - } +static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) { + // XXX: Should this be looking for implicit defs? + for (const MachineOperand &Def : MI.defs()) + Defs.insert(Def.getReg()); } static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, MachineBasicBlock::iterator B, const SIInstrInfo *TII, AliasAnalysis * AA) { - return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) || - // RAW or WAR - cannot reorder - // WAW - cannot reorder - // RAR - safe to reorder - !(A->mayStore() || B->mayStore())); + // RAW or WAR - cannot reorder + // WAW - cannot reorder + // RAR - safe to reorder + return !(A->mayStore() || B->mayStore()) || + TII->areMemAccessesTriviallyDisjoint(*A, *B, AA); } // Add MI and its defs to the lists if MI reads one of the defs that are // already in the list. Returns true in that case. 
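The rewritten memAccessesCanBeReordered above encodes the usual dependence rule: a read-after-read pair is always safe to reorder, and any pair involving a store is only safe when the accesses are provably disjoint. A toy restatement with a made-up disjointness check standing in for TII->areMemAccessesTriviallyDisjoint:

    #include <cassert>

    struct MemAccess {
      bool MayStore;
      int Addr; // toy "address" used only by the disjointness stand-in
      int Size;
    };

    // Stand-in for areMemAccessesTriviallyDisjoint: no byte overlap.
    bool triviallyDisjoint(const MemAccess &A, const MemAccess &B) {
      return A.Addr + A.Size <= B.Addr || B.Addr + B.Size <= A.Addr;
    }

    // Same shape as the rewritten predicate: RAR is always fine, anything
    // involving a store (RAW/WAR/WAW) needs a proof of disjointness.
    bool canReorder(const MemAccess &A, const MemAccess &B) {
      return !(A.MayStore || B.MayStore) || triviallyDisjoint(A, B);
    }

    int main() {
      MemAccess load0{false, 0, 4}, load4{false, 4, 4}, store0{true, 0, 4};
      assert(canReorder(load0, load4));   // RAR
      assert(canReorder(store0, load4));  // disjoint store/load
      assert(!canReorder(store0, load0)); // overlapping, store involved
      return 0;
    }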
static bool addToListsIfDependent(MachineInstr &MI, - SmallVectorImpl<const MachineOperand *> &Defs, + DenseSet<unsigned> &Defs, SmallVectorImpl<MachineInstr*> &Insts) { - for (const MachineOperand *Def : Defs) { - bool ReadDef = MI.readsVirtualRegister(Def->getReg()); - // If ReadDef is true, then there is a use of Def between I - // and the instruction that I will potentially be merged with. We - // will need to move this instruction after the merged instructions. - if (ReadDef) { + for (MachineOperand &Use : MI.operands()) { + // If one of the defs is read, then there is a use of Def between I and the + // instruction that I will potentially be merged with. We will need to move + // this instruction after the merged instructions. + + if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) { Insts.push_back(&MI); addDefsToList(MI, Defs); return true; @@ -211,6 +242,15 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { CI.UseST64 = false; CI.BaseOff = 0; + // Handle SMEM and VMEM instructions. + if (CI.InstClass != DS_READ_WRITE) { + unsigned Diff = CI.IsX2 ? 2 : 1; + return (EltOffset0 + Diff == EltOffset1 || + EltOffset1 + Diff == EltOffset0) && + CI.GLC0 == CI.GLC1 && + (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1); + } + // If the offset in elements doesn't fit in 8-bits, we might be able to use // the stride 64 versions. if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && @@ -248,30 +288,70 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { return false; } -bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) { - MachineBasicBlock::iterator E = CI.I->getParent()->end(); +bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { + MachineBasicBlock *MBB = CI.I->getParent(); + MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator MBBI = CI.I; + + unsigned AddrOpName[3] = {0}; + int AddrIdx[3]; + const MachineOperand *AddrReg[3]; + unsigned NumAddresses = 0; + + switch (CI.InstClass) { + case DS_READ_WRITE: + AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; + break; + case S_BUFFER_LOAD_IMM: + AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; + break; + case BUFFER_LOAD_OFFEN: + case BUFFER_STORE_OFFEN: + AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; + AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; + AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; + break; + case BUFFER_LOAD_OFFSET: + case BUFFER_STORE_OFFSET: + AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; + AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; + break; + } + + for (unsigned i = 0; i < NumAddresses; i++) { + AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]); + AddrReg[i] = &CI.I->getOperand(AddrIdx[i]); + + // We only ever merge operations with the same base address register, so don't + // bother scanning forward if there are no other uses. + if (AddrReg[i]->isReg() && + (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) || + MRI->hasOneNonDBGUse(AddrReg[i]->getReg()))) + return false; + } + ++MBBI; - SmallVector<const MachineOperand *, 8> DefsToMove; + DenseSet<unsigned> DefsToMove; addDefsToList(*CI.I, DefsToMove); for ( ; MBBI != E; ++MBBI) { if (MBBI->getOpcode() != CI.I->getOpcode()) { - // This is not a matching DS instruction, but we can keep looking as // long as one of these conditions are met: // 1. It is safe to move I down past MBBI. // 2. It is safe to move MBBI down past the instruction that I will // be merged into. 
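The new SMEM/VMEM branch of offsetsCanBeCombined above only merges exactly adjacent accesses with matching cache bits. A self-contained restatement of that predicate (the enum is trimmed to what the check needs, offsets are in elements of the offset encoding):

    #include <cassert>

    enum InstClass { DS_READ_WRITE, S_BUFFER_LOAD_IMM, BUFFER_LOAD_OFFEN };

    // Mirror of the new SMEM/VMEM branch: the two accesses must be exactly
    // adjacent (1 element apart, or 2 when the inputs are already x2), glc
    // must match, and slc must match except for S_BUFFER_LOAD_IMM.
    bool smemVmemOffsetsCombine(InstClass IC, bool IsX2,
                                unsigned EltOffset0, unsigned EltOffset1,
                                bool GLC0, bool GLC1, bool SLC0, bool SLC1) {
      unsigned Diff = IsX2 ? 2 : 1;
      return (EltOffset0 + Diff == EltOffset1 ||
              EltOffset1 + Diff == EltOffset0) &&
             GLC0 == GLC1 && (IC == S_BUFFER_LOAD_IMM || SLC0 == SLC1);
    }

    int main() {
      // s_buffer_load_dword at element offsets 1 and 2 can become an x2 load.
      assert(smemVmemOffsetsCombine(S_BUFFER_LOAD_IMM, false, 1, 2, 0, 0, 0, 0));
      // Mismatched glc bits block the merge.
      assert(!smemVmemOffsetsCombine(S_BUFFER_LOAD_IMM, false, 1, 2, 1, 0, 0, 0));
      // Two x2 loads must be 2 elements apart to form an x4.
      assert(smemVmemOffsetsCombine(BUFFER_LOAD_OFFEN, true, 4, 6, 0, 0, 0, 0));
      return 0;
    }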
- if (MBBI->hasUnmodeledSideEffects()) + if (MBBI->hasUnmodeledSideEffects()) { // We can't re-order this instruction with respect to other memory - // opeations, so we fail both conditions mentioned above. + // operations, so we fail both conditions mentioned above. return false; + } if (MBBI->mayLoadOrStore() && - !memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA)) { + (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) || + !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) { // We fail condition #1, but we may still be able to satisfy condition // #2. Add this instruction to the move list and then we will check // if condition #2 holds once we have selected the matching instruction. @@ -300,21 +380,47 @@ bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) { if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove)) continue; - int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), - AMDGPU::OpName::addr); - const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx); - const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx); + bool Match = true; + for (unsigned i = 0; i < NumAddresses; i++) { + const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]); + + if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { + if (AddrReg[i]->isImm() != AddrRegNext.isImm() || + AddrReg[i]->getImm() != AddrRegNext.getImm()) { + Match = false; + break; + } + continue; + } + + // Check same base pointer. Be careful of subregisters, which can occur with + // vectors of pointers. + if (AddrReg[i]->getReg() != AddrRegNext.getReg() || + AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { + Match = false; + break; + } + } - // Check same base pointer. Be careful of subregisters, which can occur with - // vectors of pointers. - if (AddrReg0.getReg() == AddrReg1.getReg() && - AddrReg0.getSubReg() == AddrReg1.getSubReg()) { + if (Match) { int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset); - CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm() & 0xffff; - CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff; + CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm(); + CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm(); CI.Paired = MBBI; + if (CI.InstClass == DS_READ_WRITE) { + CI.Offset0 &= 0xffff; + CI.Offset1 &= 0xffff; + } else { + CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm(); + CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm(); + if (CI.InstClass != S_BUFFER_LOAD_IMM) { + CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm(); + CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm(); + } + } + // Check both offsets fit in the reduced range. // We also need to go through the list of instructions that we plan to // move and make sure they are all safe to move down past the merged @@ -336,6 +442,20 @@ bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) { return false; } +unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { + if (STM->ldsRequiresM0Init()) + return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; + return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; +} + +unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { + if (STM->ldsRequiresM0Init()) + return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; + + return (EltSize == 4) ? 
+ AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9; +} + MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); @@ -349,12 +469,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( unsigned NewOffset0 = CI.Offset0; unsigned NewOffset1 = CI.Offset1; - unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32 - : AMDGPU::DS_READ2_B64; - - if (CI.UseST64) - Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 - : AMDGPU::DS_READ2ST64_B64; + unsigned Opc = CI.UseST64 ? + read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; @@ -382,9 +498,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( if (CI.BaseOff) { BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg) - .addImm(CI.BaseOff) - .addReg(AddrReg->getReg()); + + unsigned AddOpc = STM->hasAddNoCarry() ? + AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32; + BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg) + .addImm(CI.BaseOff) + .addReg(AddrReg->getReg()); } MachineInstrBuilder Read2 = @@ -417,6 +536,20 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( return Next; } +unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { + if (STM->ldsRequiresM0Init()) + return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; + return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9; +} + +unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { + if (STM->ldsRequiresM0Init()) + return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64; + + return (EltSize == 4) ? + AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9; +} + MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( CombineInfo &CI) { MachineBasicBlock *MBB = CI.I->getParent(); @@ -430,12 +563,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( unsigned NewOffset0 = CI.Offset0; unsigned NewOffset1 = CI.Offset1; - unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32 - : AMDGPU::DS_WRITE2_B64; - - if (CI.UseST64) - Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 - : AMDGPU::DS_WRITE2ST64_B64; + unsigned Opc = CI.UseST64 ? + write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); if (NewOffset0 > NewOffset1) { // Canonicalize the merged instruction so the smaller offset comes first. @@ -455,9 +584,12 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( if (CI.BaseOff) { BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); BaseRegFlags = RegState::Kill; - BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg) - .addImm(CI.BaseOff) - .addReg(Addr->getReg()); + + unsigned AddOpc = STM->hasAddNoCarry() ? + AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32; + BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg) + .addImm(CI.BaseOff) + .addReg(Addr->getReg()); } MachineInstrBuilder Write2 = @@ -480,6 +612,194 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( return Next; } +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( + CombineInfo &CI) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); + unsigned Opcode = CI.IsX2 ? 
AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM : + AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + + const TargetRegisterClass *SuperRC = + CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass; + unsigned DestReg = MRI->createVirtualRegister(SuperRC); + unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); + + BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) + .addImm(MergedOffset) // offset + .addImm(CI.GLC0) // glc + .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + + unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; + unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; + + // Handle descending offsets + if (CI.Offset0 > CI.Offset1) + std::swap(SubRegIdx0, SubRegIdx1); + + // Copy to the old destination registers. + const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); + const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); + const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst); + + BuildMI(*MBB, CI.Paired, DL, CopyDesc) + .add(*Dest0) // Copy to same destination including flags and sub reg. + .addReg(DestReg, 0, SubRegIdx0); + MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); + + moveInstsAfter(Copy1, CI.InstsToMove); + + MachineBasicBlock::iterator Next = std::next(CI.I); + CI.I->eraseFromParent(); + CI.Paired->eraseFromParent(); + return Next; +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( + CombineInfo &CI) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); + unsigned Opcode; + + if (CI.InstClass == BUFFER_LOAD_OFFEN) { + Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN : + AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN; + } else { + Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET : + AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; + } + + const TargetRegisterClass *SuperRC = + CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; + unsigned DestReg = MRI->createVirtualRegister(SuperRC); + unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); + + auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg); + + if (CI.InstClass == BUFFER_LOAD_OFFEN) + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) + .addImm(MergedOffset) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + + unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; + unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; + + // Handle descending offsets + if (CI.Offset0 > CI.Offset1) + std::swap(SubRegIdx0, SubRegIdx1); + + // Copy to the old destination registers. + const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); + const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); + const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata); + + BuildMI(*MBB, CI.Paired, DL, CopyDesc) + .add(*Dest0) // Copy to same destination including flags and sub reg. 
+ .addReg(DestReg, 0, SubRegIdx0); + MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc) + .add(*Dest1) + .addReg(DestReg, RegState::Kill, SubRegIdx1); + + moveInstsAfter(Copy1, CI.InstsToMove); + + MachineBasicBlock::iterator Next = std::next(CI.I); + CI.I->eraseFromParent(); + CI.Paired->eraseFromParent(); + return Next; +} + +unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode( + const MachineInstr &I, bool &IsX2, bool &IsOffen) const { + IsX2 = false; + IsOffen = false; + + switch (I.getOpcode()) { + case AMDGPU::BUFFER_STORE_DWORD_OFFEN: + IsOffen = true; + return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN; + case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: + IsOffen = true; + return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact; + case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN: + IsX2 = true; + IsOffen = true; + return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN; + case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact: + IsX2 = true; + IsOffen = true; + return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact; + case AMDGPU::BUFFER_STORE_DWORD_OFFSET: + return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; + case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: + return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact; + case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET: + IsX2 = true; + return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; + case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact: + IsX2 = true; + return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact; + } + return 0; +} + +MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( + CombineInfo &CI) { + MachineBasicBlock *MBB = CI.I->getParent(); + DebugLoc DL = CI.I->getDebugLoc(); + bool Unused1, Unused2; + unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2); + + unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; + unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1; + + // Handle descending offsets + if (CI.Offset0 > CI.Offset1) + std::swap(SubRegIdx0, SubRegIdx1); + + // Copy to the new source register. + const TargetRegisterClass *SuperRC = + CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass; + unsigned SrcReg = MRI->createVirtualRegister(SuperRC); + + const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); + const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata); + + BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) + .add(*Src0) + .addImm(SubRegIdx0) + .add(*Src1) + .addImm(SubRegIdx1); + + auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode)) + .addReg(SrcReg, RegState::Kill); + + if (CI.InstClass == BUFFER_STORE_OFFEN) + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) + .addImm(std::min(CI.Offset0, CI.Offset1)) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired)); + + moveInstsAfter(MIB, CI.InstsToMove); + + MachineBasicBlock::iterator Next = std::next(CI.I); + CI.I->eraseFromParent(); + CI.Paired->eraseFromParent(); + return Next; +} + // Scan through looking for adjacent LDS operations with constant offsets from // the same base register. We rely on the scheduler to do the hard work of // clustering nearby loads, and assume these are all adjacent. 
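mergeBufferStorePair above builds a REG_SEQUENCE from the two store sources and emits one x2 store at the smaller offset, swapping the sub-registers when the offsets are descending. A toy check that the merged store has the same memory effect as the original pair (memory here is addressed in elements, which is a simplification):

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <utility>

    // Toy memory, addressed in dword-sized elements.
    using Memory = std::map<unsigned, uint32_t>;

    void storeDword(Memory &M, unsigned Off, uint32_t V) { M[Off] = V; }

    // What the merged store does: one x2 store at min(Off0, Off1) whose lanes
    // are ordered by offset (descending offsets swap the sub-registers).
    void storeDwordX2Merged(Memory &M, unsigned Off0, uint32_t V0,
                            unsigned Off1, uint32_t V1) {
      if (Off0 > Off1) { std::swap(Off0, Off1); std::swap(V0, V1); }
      M[Off0] = V0;
      M[Off0 + 1] = V1;
    }

    int main() {
      Memory A, B;
      storeDword(A, 2, 0x11);
      storeDword(A, 3, 0x22);              // original adjacent pair
      storeDwordX2Merged(B, 3, 0x22, 2, 0x11); // merged, descending offsets
      assert(A == B);                      // same memory effect
      return 0;
    }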
@@ -498,9 +818,14 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { CombineInfo CI; CI.I = I; unsigned Opc = MI.getOpcode(); - if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) { - CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4; - if (findMatchingDSInst(CI)) { + if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 || + Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) { + + CI.InstClass = DS_READ_WRITE; + CI.EltSize = + (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4; + + if (findMatchingInst(CI)) { Modified = true; I = mergeRead2Pair(CI); } else { @@ -508,9 +833,14 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { } continue; - } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) { - CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4; - if (findMatchingDSInst(CI)) { + } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 || + Opc == AMDGPU::DS_WRITE_B32_gfx9 || + Opc == AMDGPU::DS_WRITE_B64_gfx9) { + CI.InstClass = DS_READ_WRITE; + CI.EltSize + = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4; + + if (findMatchingInst(CI)) { Modified = true; I = mergeWrite2Pair(CI); } else { @@ -519,6 +849,62 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { continue; } + if (STM->hasSBufferLoadStoreAtomicDwordxN() && + (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM || + Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) { + // EltSize is in units of the offset encoding. + CI.InstClass = S_BUFFER_LOAD_IMM; + CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4); + CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; + if (findMatchingInst(CI)) { + Modified = true; + I = mergeSBufferLoadImmPair(CI); + if (!CI.IsX2) + CreatedX2++; + } else { + ++I; + } + continue; + } + if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || + Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || + Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET || + Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) { + if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN || + Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN) + CI.InstClass = BUFFER_LOAD_OFFEN; + else + CI.InstClass = BUFFER_LOAD_OFFSET; + + CI.EltSize = 4; + CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN || + Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; + if (findMatchingInst(CI)) { + Modified = true; + I = mergeBufferLoadPair(CI); + if (!CI.IsX2) + CreatedX2++; + } else { + ++I; + } + continue; + } + + bool StoreIsX2, IsOffen; + if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) { + CI.InstClass = IsOffen ? 
BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET; + CI.EltSize = 4; + CI.IsX2 = StoreIsX2; + if (findMatchingInst(CI)) { + Modified = true; + I = mergeBufferStorePair(CI); + if (!CI.IsX2) + CreatedX2++; + } else { + ++I; + } + continue; + } ++I; } @@ -527,25 +913,33 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { } bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; - const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); - if (!STM.loadStoreOptEnabled()) + STM = &MF.getSubtarget<SISubtarget>(); + if (!STM->loadStoreOptEnabled()) return false; - TII = STM.getInstrInfo(); + TII = STM->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + assert(MRI->isSSA() && "Must be run on SSA"); + DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); bool Modified = false; - for (MachineBasicBlock &MBB : MF) + for (MachineBasicBlock &MBB : MF) { + CreatedX2 = 0; Modified |= optimizeBlock(MBB); + // Run again to convert x2 to x4. + if (CreatedX2 >= 1) + Modified |= optimizeBlock(MBB); + } + return Modified; } diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index 5f1c7f1fc42f..a9af83323976 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -21,31 +21,31 @@ /// EXEC to update the predicates. /// /// For example: -/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2 -/// %SGPR0 = SI_IF %VCC -/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 -/// %SGPR0 = SI_ELSE %SGPR0 -/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 -/// SI_END_CF %SGPR0 +/// %vcc = V_CMP_GT_F32 %vgpr1, %vgpr2 +/// %sgpr0 = SI_IF %vcc +/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 +/// %sgpr0 = SI_ELSE %sgpr0 +/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0 +/// SI_END_CF %sgpr0 /// /// becomes: /// -/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask -/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask +/// %sgpr0 = S_AND_SAVEEXEC_B64 %vcc // Save and update the exec mask +/// %sgpr0 = S_XOR_B64 %sgpr0, %exec // Clear live bits from saved exec mask /// S_CBRANCH_EXECZ label0 // This instruction is an optional /// // optimization which allows us to /// // branch if all the bits of /// // EXEC are zero. -/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch +/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch /// /// label0: -/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC // Restore the exec mask for the Then block -/// %EXEC = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask +/// %sgpr0 = S_OR_SAVEEXEC_B64 %exec // Restore the exec mask for the Then block +/// %exec = S_XOR_B64 %sgpr0, %exec // Clear live bits from saved exec mask /// S_BRANCH_EXECZ label1 // Use our branch optimization /// // instruction again. 
-/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block +/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr // Do the THEN block /// label1: -/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits +/// %exec = S_OR_B64 %exec, %sgpr0 // Re-enable saved exec mask bits //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -53,7 +53,7 @@ #include "SIInstrInfo.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -63,9 +63,9 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" -#include "llvm/Target/TargetRegisterInfo.h" #include <cassert> #include <iterator> @@ -134,6 +134,39 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) { char &llvm::SILowerControlFlowID = SILowerControlFlow::ID; +static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI, + const SIInstrInfo *TII) { + unsigned SaveExecReg = MI.getOperand(0).getReg(); + auto U = MRI->use_instr_nodbg_begin(SaveExecReg); + + if (U == MRI->use_instr_nodbg_end() || + std::next(U) != MRI->use_instr_nodbg_end() || + U->getOpcode() != AMDGPU::SI_END_CF) + return false; + + // Check for SI_KILL_*_TERMINATOR on path from if to endif. + // if there is any such terminator simplififcations are not safe. + auto SMBB = MI.getParent(); + auto EMBB = U->getParent(); + DenseSet<const MachineBasicBlock*> Visited; + SmallVector<MachineBasicBlock*, 4> Worklist(SMBB->succ_begin(), + SMBB->succ_end()); + + while (!Worklist.empty()) { + MachineBasicBlock *MBB = Worklist.pop_back_val(); + + if (MBB == EMBB || !Visited.insert(MBB).second) + continue; + for(auto &Term : MBB->terminators()) + if (TII->isKillTerminator(Term.getOpcode())) + return false; + + Worklist.append(MBB->succ_begin(), MBB->succ_end()); + } + + return true; +} + void SILowerControlFlow::emitIf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -149,9 +182,15 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { MachineOperand &ImpDefSCC = MI.getOperand(4); assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); + // If there is only one use of save exec register and that use is SI_END_CF, + // we can optimize SI_IF by returning the full saved exec mask instead of + // just cleared bits. + bool SimpleIf = isSimpleIf(MI, MRI, TII); + // Add an implicit def of exec to discourage scheduling VALU after this which // will interfere with trying to form s_and_saveexec_b64 later. - unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned CopyReg = SimpleIf ? 
SaveExecReg + : MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); MachineInstr *CopyExec = BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg) .addReg(AMDGPU::EXEC) @@ -166,11 +205,14 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { .addReg(Cond.getReg()); setImpSCCDefDead(*And, true); - MachineInstr *Xor = - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg) - .addReg(Tmp) - .addReg(CopyReg); - setImpSCCDefDead(*Xor, ImpDefSCC.isDead()); + MachineInstr *Xor = nullptr; + if (!SimpleIf) { + Xor = + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg) + .addReg(Tmp) + .addReg(CopyReg); + setImpSCCDefDead(*Xor, ImpDefSCC.isDead()); + } // Use a copy that is a terminator to get correct spill code placement it with // fast regalloc. @@ -194,7 +236,8 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { // register. LIS->ReplaceMachineInstrInMaps(MI, *And); - LIS->InsertMachineInstrInMaps(*Xor); + if (!SimpleIf) + LIS->InsertMachineInstrInMaps(*Xor); LIS->InsertMachineInstrInMaps(*SetExec); LIS->InsertMachineInstrInMaps(*NewBr); @@ -207,7 +250,8 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { LIS->removeInterval(SaveExecReg); LIS->createAndComputeVirtRegInterval(SaveExecReg); LIS->createAndComputeVirtRegInterval(Tmp); - LIS->createAndComputeVirtRegInterval(CopyReg); + if (!SimpleIf) + LIS->createAndComputeVirtRegInterval(CopyReg); } void SILowerControlFlow::emitElse(MachineInstr &MI) { diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp index ba616ada0c9c..da57b90dd8c4 100644 --- a/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -17,7 +17,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -121,11 +121,14 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { } } + unsigned int TmpSrc = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::COPY), TmpSrc) + .add(Src); BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64)) .add(Dst) .addImm(0) .addImm(-1) - .add(Src); + .addReg(TmpSrc); MI.eraseFromParent(); } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) && SrcRC == &AMDGPU::VReg_1RegClass) { diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index a7c8166ff6d2..6013ebc81d9f 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//===-- SIMachineFunctionInfo.cpp -------- SI Machine Function Info -------===// +//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===// // // The LLVM Compiler Infrastructure // @@ -8,13 +8,19 @@ //===----------------------------------------------------------------------===// #include "SIMachineFunctionInfo.h" +#include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUSubtarget.h" -#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/Optional.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" 
-#include "llvm/IR/LLVMContext.h" +#include <cassert> +#include <vector> #define MAX_LANES 64 @@ -22,44 +28,8 @@ using namespace llvm; SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), - TIDReg(AMDGPU::NoRegister), - ScratchRSrcReg(AMDGPU::PRIVATE_RSRC_REG), - ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG), - FrameOffsetReg(AMDGPU::FP_REG), - StackPtrOffsetReg(AMDGPU::SP_REG), - PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), - DispatchPtrUserSGPR(AMDGPU::NoRegister), - QueuePtrUserSGPR(AMDGPU::NoRegister), - KernargSegmentPtrUserSGPR(AMDGPU::NoRegister), - DispatchIDUserSGPR(AMDGPU::NoRegister), - FlatScratchInitUserSGPR(AMDGPU::NoRegister), - PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister), - GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister), - GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister), - GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister), - WorkGroupIDXSystemSGPR(AMDGPU::NoRegister), - WorkGroupIDYSystemSGPR(AMDGPU::NoRegister), - WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), - WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), - PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), - WorkItemIDXVGPR(AMDGPU::NoRegister), - WorkItemIDYVGPR(AMDGPU::NoRegister), - WorkItemIDZVGPR(AMDGPU::NoRegister), - PSInputAddr(0), - PSInputEnable(0), - ReturnsVoid(true), - FlatWorkGroupSizes(0, 0), - WavesPerEU(0, 0), - DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}), - DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}), - LDSWaveSpillSize(0), - NumUserSGPRs(0), - NumSystemSGPRs(0), - HasSpilledSGPRs(false), - HasSpilledVGPRs(false), - HasNonSpillStackObjects(false), - NumSpilledSGPRs(0), - NumSpilledVGPRs(0), + BufferPSV(*(MF.getSubtarget().getInstrInfo())), + ImagePSV(*(MF.getSubtarget().getInstrInfo())), PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), @@ -77,11 +47,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false), - ImplicitBufferPtr(false) { + ImplicitBufferPtr(false), + ImplicitArgPtr(false), + GITPtrHigh(0xffffffff) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - const Function *F = MF.getFunction(); - FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F); - WavesPerEU = ST.getWavesPerEU(*F); + const Function &F = MF.getFunction(); + FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); + WavesPerEU = ST.getWavesPerEU(F); if (!isEntryFunction()) { // Non-entry functions have no special inputs for now, other registers @@ -91,17 +63,26 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) FrameOffsetReg = AMDGPU::SGPR5; StackPtrOffsetReg = AMDGPU::SGPR32; - // FIXME: Not really a system SGPR. 
- PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg; + ArgInfo.PrivateSegmentBuffer = + ArgDescriptor::createRegister(ScratchRSrcReg); + ArgInfo.PrivateSegmentWaveByteOffset = + ArgDescriptor::createRegister(ScratchWaveOffsetReg); + + if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) + ImplicitArgPtr = true; + } else { + if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) + KernargSegmentPtr = true; } - CallingConv::ID CC = F->getCallingConv(); + CallingConv::ID CC = F.getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { - KernargSegmentPtr = !F->arg_empty(); + if (!F.arg_empty()) + KernargSegmentPtr = true; WorkGroupIDX = true; WorkItemIDX = true; } else if (CC == CallingConv::AMDGPU_PS) { - PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); + PSInputAddr = AMDGPU::getInitialPSInputAddr(F); } if (ST.debuggerEmitPrologue()) { @@ -113,27 +94,27 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDY = true; WorkItemIDZ = true; } else { - if (F->hasFnAttribute("amdgpu-work-group-id-x")) + if (F.hasFnAttribute("amdgpu-work-group-id-x")) WorkGroupIDX = true; - if (F->hasFnAttribute("amdgpu-work-group-id-y")) + if (F.hasFnAttribute("amdgpu-work-group-id-y")) WorkGroupIDY = true; - if (F->hasFnAttribute("amdgpu-work-group-id-z")) + if (F.hasFnAttribute("amdgpu-work-group-id-z")) WorkGroupIDZ = true; - if (F->hasFnAttribute("amdgpu-work-item-id-x")) + if (F.hasFnAttribute("amdgpu-work-item-id-x")) WorkItemIDX = true; - if (F->hasFnAttribute("amdgpu-work-item-id-y")) + if (F.hasFnAttribute("amdgpu-work-item-id-y")) WorkItemIDY = true; - if (F->hasFnAttribute("amdgpu-work-item-id-z")) + if (F.hasFnAttribute("amdgpu-work-item-id-z")) WorkItemIDZ = true; } const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - bool MaySpill = ST.isVGPRSpillingEnabled(*F); + bool MaySpill = ST.isVGPRSpillingEnabled(F); bool HasStackObjects = FrameInfo.hasStackObjects(); if (isEntryFunction()) { @@ -145,10 +126,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (HasStackObjects || MaySpill) { PrivateSegmentWaveByteOffset = true; - // HS and GS always have the scratch wave offset in SGPR5 on GFX9. - if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && - (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) - PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5; + // HS and GS always have the scratch wave offset in SGPR5 on GFX9. + if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && + (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) + ArgInfo.PrivateSegmentWaveByteOffset + = ArgDescriptor::createRegister(AMDGPU::SGPR5); } } @@ -157,78 +139,94 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (HasStackObjects || MaySpill) PrivateSegmentBuffer = true; - if (F->hasFnAttribute("amdgpu-dispatch-ptr")) + if (F.hasFnAttribute("amdgpu-dispatch-ptr")) DispatchPtr = true; - if (F->hasFnAttribute("amdgpu-queue-ptr")) + if (F.hasFnAttribute("amdgpu-queue-ptr")) QueuePtr = true; - if (F->hasFnAttribute("amdgpu-dispatch-id")) + if (F.hasFnAttribute("amdgpu-dispatch-id")) DispatchID = true; } else if (ST.isMesaGfxShader(MF)) { if (HasStackObjects || MaySpill) ImplicitBufferPtr = true; } - if (F->hasFnAttribute("amdgpu-kernarg-segment-ptr")) + if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr")) KernargSegmentPtr = true; if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) { // TODO: This could be refined a lot. 
The attribute is a poor way of // detecting calls that may require it before argument lowering. - if (HasStackObjects || F->hasFnAttribute("amdgpu-flat-scratch")) + if (HasStackObjects || F.hasFnAttribute("amdgpu-flat-scratch")) FlatScratchInit = true; } + + Attribute A = F.getFnAttribute("amdgpu-git-ptr-high"); + StringRef S = A.getValueAsString(); + if (!S.empty()) + S.consumeInteger(0, GITPtrHigh); } unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( const SIRegisterInfo &TRI) { - PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + ArgInfo.PrivateSegmentBuffer = + ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass)); NumUserSGPRs += 4; - return PrivateSegmentBufferUserSGPR; + return ArgInfo.PrivateSegmentBuffer.getRegister(); } unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { - DispatchPtrUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return DispatchPtrUserSGPR; + return ArgInfo.DispatchPtr.getRegister(); } unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { - QueuePtrUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return QueuePtrUserSGPR; + return ArgInfo.QueuePtr.getRegister(); } unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { - KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.KernargSegmentPtr + = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return KernargSegmentPtrUserSGPR; + return ArgInfo.KernargSegmentPtr.getRegister(); } unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) { - DispatchIDUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return DispatchIDUserSGPR; + return ArgInfo.DispatchID.getRegister(); } unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { - FlatScratchInitUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return FlatScratchInitUserSGPR; + return ArgInfo.FlatScratchInit.getRegister(); } unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) { - ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return ImplicitBufferPtrUserSGPR; + return ArgInfo.ImplicitBufferPtr.getRegister(); +} + +static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg 
Reg) { + for (unsigned I = 0; CSRegs[I]; ++I) { + if (CSRegs[I] == Reg) + return true; + } + + return false; } /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. @@ -252,6 +250,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, int NumLanes = Size / 4; + const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); + // Make sure to handle the case where a wide SGPR spill may span between two // VGPRs. for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) { @@ -261,21 +261,28 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, if (VGPRIndex == 0) { LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); if (LaneVGPR == AMDGPU::NoRegister) { - // We have no VGPRs left for spilling SGPRs. Reset because we won't + // We have no VGPRs left for spilling SGPRs. Reset because we will not // partially spill the SGPR to VGPRs. SGPRToVGPRSpills.erase(FI); NumVGPRSpillLanes -= I; return false; } - SpillVGPRs.push_back(LaneVGPR); + Optional<int> CSRSpillFI; + if (FrameInfo.hasCalls() && CSRegs && isCalleeSavedReg(CSRegs, LaneVGPR)) { + // TODO: Should this be a CreateSpillStackObject? This is technically a + // weird CSR spill. + CSRSpillFI = FrameInfo.CreateStackObject(4, 4, false); + } + + SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI)); // Add this register as live-in to all blocks to avoid machine verifer // complaining about use of an undefined physical register. for (MachineBasicBlock &BB : MF) BB.addLiveIn(LaneVGPR); } else { - LaneVGPR = SpillVGPRs.back(); + LaneVGPR = SpillVGPRs.back().VGPR; } SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex)); diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 4c7f38a09a48..5dde72910ee3 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -1,4 +1,4 @@ -//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==// +//==- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface --*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -14,23 +14,32 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H #define LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H +#include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUMachineFunction.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/ErrorHandling.h" #include <array> #include <cassert> -#include <map> #include <utility> +#include <vector> namespace llvm { +class MachineFrameInfo; +class MachineFunction; +class TargetRegisterClass; + class AMDGPUImagePseudoSourceValue : public PseudoSourceValue { public: - explicit AMDGPUImagePseudoSourceValue() : - PseudoSourceValue(PseudoSourceValue::TargetCustom) { } + explicit AMDGPUImagePseudoSourceValue(const TargetInstrInfo &TII) : + PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) { } bool isConstant(const MachineFrameInfo *) const override { // This should probably be true for most images, but we will start by being @@ -44,7 +53,7 @@ public: return false; } - bool mayAlias(const MachineFrameInfo*) const override { + bool mayAlias(const MachineFrameInfo *) const override { // FIXME: If we ever change image intrinsics to accept fat pointers, then 
// this could be true for some cases. return false; @@ -53,8 +62,8 @@ public: class AMDGPUBufferPseudoSourceValue : public PseudoSourceValue { public: - explicit AMDGPUBufferPseudoSourceValue() : - PseudoSourceValue(PseudoSourceValue::TargetCustom) { } + explicit AMDGPUBufferPseudoSourceValue(const TargetInstrInfo &TII) : + PseudoSourceValue(PseudoSourceValue::TargetCustom, TII) { } bool isConstant(const MachineFrameInfo *) const override { // This should probably be true for most images, but we will start by being @@ -68,7 +77,7 @@ public: return false; } - bool mayAlias(const MachineFrameInfo*) const override { + bool mayAlias(const MachineFrameInfo *) const override { // FIXME: If we ever change image intrinsics to accept fat pointers, then // this could be true for some cases. return false; @@ -78,86 +87,68 @@ public: /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo final : public AMDGPUMachineFunction { - // FIXME: This should be removed and getPreloadedValue moved here. - friend class SIRegisterInfo; - - unsigned TIDReg; + unsigned TIDReg = AMDGPU::NoRegister; // Registers that may be reserved for spilling purposes. These may be the same // as the input registers. - unsigned ScratchRSrcReg; - unsigned ScratchWaveOffsetReg; + unsigned ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG; + unsigned ScratchWaveOffsetReg = AMDGPU::SCRATCH_WAVE_OFFSET_REG; // This is the current function's incremented size from the kernel's scratch // wave offset register. For an entry function, this is exactly the same as // the ScratchWaveOffsetReg. - unsigned FrameOffsetReg; + unsigned FrameOffsetReg = AMDGPU::FP_REG; // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg. - unsigned StackPtrOffsetReg; - - // Input registers for non-HSA ABI - unsigned ImplicitBufferPtrUserSGPR; - - // Input registers setup for the HSA ABI. - // User SGPRs in allocation order. - unsigned PrivateSegmentBufferUserSGPR; - unsigned DispatchPtrUserSGPR; - unsigned QueuePtrUserSGPR; - unsigned KernargSegmentPtrUserSGPR; - unsigned DispatchIDUserSGPR; - unsigned FlatScratchInitUserSGPR; - unsigned PrivateSegmentSizeUserSGPR; - unsigned GridWorkGroupCountXUserSGPR; - unsigned GridWorkGroupCountYUserSGPR; - unsigned GridWorkGroupCountZUserSGPR; - - // System SGPRs in allocation order. - unsigned WorkGroupIDXSystemSGPR; - unsigned WorkGroupIDYSystemSGPR; - unsigned WorkGroupIDZSystemSGPR; - unsigned WorkGroupInfoSystemSGPR; - unsigned PrivateSegmentWaveByteOffsetSystemSGPR; - - // VGPR inputs. These are always v0, v1 and v2 for entry functions. - unsigned WorkItemIDXVGPR; - unsigned WorkItemIDYVGPR; - unsigned WorkItemIDZVGPR; + unsigned StackPtrOffsetReg = AMDGPU::SP_REG; - // Graphics info. - unsigned PSInputAddr; - unsigned PSInputEnable; + AMDGPUFunctionArgInfo ArgInfo; - bool ReturnsVoid; + // Graphics info. + unsigned PSInputAddr = 0; + unsigned PSInputEnable = 0; + + /// Number of bytes of arguments this function has on the stack. If the callee + /// is expected to restore the argument stack this should be a multiple of 16, + /// all usable during a tail call. + /// + /// The alternative would forbid tail call optimisation in some cases: if we + /// want to transfer control from a function with 8-bytes of stack-argument + /// space to a function with 16-bytes then misalignment of this value would + /// make a stack adjustment necessary, which could not be undone by the + /// callee. 
+ unsigned BytesInStackArgArea = 0; + + bool ReturnsVoid = true; // A pair of default/requested minimum/maximum flat work group sizes. // Minimum - first, maximum - second. - std::pair<unsigned, unsigned> FlatWorkGroupSizes; + std::pair<unsigned, unsigned> FlatWorkGroupSizes = {0, 0}; // A pair of default/requested minimum/maximum number of waves per execution // unit. Minimum - first, maximum - second. - std::pair<unsigned, unsigned> WavesPerEU; + std::pair<unsigned, unsigned> WavesPerEU = {0, 0}; // Stack object indices for work group IDs. - std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices; + std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices = {{0, 0, 0}}; + // Stack object indices for work item IDs. - std::array<int, 3> DebuggerWorkItemIDStackObjectIndices; + std::array<int, 3> DebuggerWorkItemIDStackObjectIndices = {{0, 0, 0}}; AMDGPUBufferPseudoSourceValue BufferPSV; AMDGPUImagePseudoSourceValue ImagePSV; private: - unsigned LDSWaveSpillSize; - unsigned ScratchOffsetReg; - unsigned NumUserSGPRs; - unsigned NumSystemSGPRs; + unsigned LDSWaveSpillSize = 0; + unsigned NumUserSGPRs = 0; + unsigned NumSystemSGPRs = 0; - bool HasSpilledSGPRs; - bool HasSpilledVGPRs; - bool HasNonSpillStackObjects; + bool HasSpilledSGPRs = false; + bool HasSpilledVGPRs = false; + bool HasNonSpillStackObjects = false; - unsigned NumSpilledSGPRs; - unsigned NumSpilledVGPRs; + unsigned NumSpilledSGPRs = 0; + unsigned NumSpilledVGPRs = 0; // Feature bits required for inputs passed in user SGPRs. bool PrivateSegmentBuffer : 1; @@ -186,6 +177,15 @@ private: // Other shaders indirect 64-bits at sgpr[0:1] bool ImplicitBufferPtr : 1; + // Pointer to where the ABI inserts special kernel arguments separate from the + // user arguments. This is an offset from the KernargSegmentPtr. + bool ImplicitArgPtr : 1; + + // The hard-wired high half of the address of the global information table + // for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since + // current hardware only allows a 16 bit value. + unsigned GITPtrHigh; + MCPhysReg getNextUserSGPR() const { assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); return AMDGPU::SGPR0 + NumUserSGPRs; @@ -201,24 +201,34 @@ public: int Lane = -1; SpilledReg() = default; - SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { } + SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) {} bool hasLane() { return Lane != -1;} bool hasReg() { return VGPR != AMDGPU::NoRegister;} }; + struct SGPRSpillVGPRCSR { + // VGPR used for SGPR spills + unsigned VGPR; + + // If the VGPR is a CSR, the stack slot used to save/restore it in the + // prolog/epilog. + Optional<int> FI; + + SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {} + }; + private: // SGPR->VGPR spilling support. - typedef std::pair<unsigned, unsigned> SpillRegMask; + using SpillRegMask = std::pair<unsigned, unsigned>; // Track VGPR + wave index for each subregister of the SGPR spilled to // frameindex key. 
DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills; unsigned NumVGPRSpillLanes = 0; - SmallVector<unsigned, 2> SpillVGPRs; + SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs; public: - SIMachineFunctionInfo(const MachineFunction &MF); ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const { @@ -227,13 +237,25 @@ public: ArrayRef<SpilledReg>() : makeArrayRef(I->second); } + ArrayRef<SGPRSpillVGPRCSR> getSGPRSpillVGPRs() const { + return SpillVGPRs; + } + bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI); - bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; - unsigned getTIDReg() const { return TIDReg; }; + bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; } + unsigned getTIDReg() const { return TIDReg; } void setTIDReg(unsigned Reg) { TIDReg = Reg; } + unsigned getBytesInStackArgArea() const { + return BytesInStackArgArea; + } + + void setBytesInStackArgArea(unsigned Bytes) { + BytesInStackArgArea = Bytes; + } + // Add user SGPRs. unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI); unsigned addDispatchPtr(const SIRegisterInfo &TRI); @@ -245,37 +267,51 @@ public: // Add system SGPRs. unsigned addWorkGroupIDX() { - WorkGroupIDXSystemSGPR = getNextSystemSGPR(); + ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; - return WorkGroupIDXSystemSGPR; + return ArgInfo.WorkGroupIDX.getRegister(); } unsigned addWorkGroupIDY() { - WorkGroupIDYSystemSGPR = getNextSystemSGPR(); + ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; - return WorkGroupIDYSystemSGPR; + return ArgInfo.WorkGroupIDY.getRegister(); } unsigned addWorkGroupIDZ() { - WorkGroupIDZSystemSGPR = getNextSystemSGPR(); + ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; - return WorkGroupIDZSystemSGPR; + return ArgInfo.WorkGroupIDZ.getRegister(); } unsigned addWorkGroupInfo() { - WorkGroupInfoSystemSGPR = getNextSystemSGPR(); + ArgInfo.WorkGroupInfo = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; - return WorkGroupInfoSystemSGPR; + return ArgInfo.WorkGroupInfo.getRegister(); + } + + // Add special VGPR inputs + void setWorkItemIDX(ArgDescriptor Arg) { + ArgInfo.WorkItemIDX = Arg; + } + + void setWorkItemIDY(ArgDescriptor Arg) { + ArgInfo.WorkItemIDY = Arg; + } + + void setWorkItemIDZ(ArgDescriptor Arg) { + ArgInfo.WorkItemIDZ = Arg; } unsigned addPrivateSegmentWaveByteOffset() { - PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR(); + ArgInfo.PrivateSegmentWaveByteOffset + = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; - return PrivateSegmentWaveByteOffsetSystemSGPR; + return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } void setPrivateSegmentWaveByteOffset(unsigned Reg) { - PrivateSegmentWaveByteOffsetSystemSGPR = Reg; + ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(Reg); } bool hasPrivateSegmentBuffer() const { @@ -346,10 +382,35 @@ public: return WorkItemIDZ; } + bool hasImplicitArgPtr() const { + return ImplicitArgPtr; + } + bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; } + AMDGPUFunctionArgInfo &getArgInfo() { + return ArgInfo; + } + + const AMDGPUFunctionArgInfo &getArgInfo() const { + return ArgInfo; + } + + std::pair<const ArgDescriptor *, const TargetRegisterClass *> + getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const { + return 
ArgInfo.getPreloadedValue(Value); + } + + unsigned getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const { + return ArgInfo.getPreloadedValue(Value).first->getRegister(); + } + + unsigned getGITPtrHigh() const { + return GITPtrHigh; + } + unsigned getNumUserSGPRs() const { return NumUserSGPRs; } @@ -359,7 +420,7 @@ public: } unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { - return PrivateSegmentWaveByteOffsetSystemSGPR; + return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } /// \brief Returns the physical register reserved for use as the resource @@ -401,11 +462,11 @@ public: } unsigned getQueuePtrUserSGPR() const { - return QueuePtrUserSGPR; + return ArgInfo.QueuePtr.getRegister(); } unsigned getImplicitBufferPtrUserSGPR() const { - return ImplicitBufferPtrUserSGPR; + return ArgInfo.ImplicitBufferPtr.getRegister(); } bool hasSpilledSGPRs() const { @@ -537,13 +598,13 @@ public: switch (Dim) { case 0: assert(hasWorkGroupIDX()); - return WorkGroupIDXSystemSGPR; + return ArgInfo.WorkGroupIDX.getRegister(); case 1: assert(hasWorkGroupIDY()); - return WorkGroupIDYSystemSGPR; + return ArgInfo.WorkGroupIDY.getRegister(); case 2: assert(hasWorkGroupIDZ()); - return WorkGroupIDZSystemSGPR; + return ArgInfo.WorkGroupIDZ.getRegister(); } llvm_unreachable("unexpected dimension"); } diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp index 34886c48f461..6b67b76652ed 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -19,16 +19,16 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/LiveInterval.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetRegisterInfo.h" #include <algorithm> #include <cassert> #include <map> @@ -595,11 +595,11 @@ void SIScheduleBlock::printDebug(bool full) { << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n"; dbgs() << "LiveIns:\n"; for (unsigned Reg : LiveInRegs) - dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' '; + dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << "\nLiveOuts:\n"; for (unsigned Reg : LiveOutRegs) - dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' '; + dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; } dbgs() << "\nInstructions:\n"; @@ -1130,6 +1130,62 @@ void SIScheduleBlockCreator::regroupNoUserInstructions() { } } +void SIScheduleBlockCreator::colorExports() { + unsigned ExportColor = NextNonReservedID++; + SmallVector<unsigned, 8> ExpGroup; + + // Put all exports together in a block. + // The block will naturally end up being scheduled last, + // thus putting exports at the end of the schedule, which + // is better for performance. + // However we must ensure, for safety, the exports can be put + // together in the same block without any other instruction. + // This could happen, for example, when scheduling after regalloc + // if reloading a spilled register from memory using the same + // register than used in a previous export. + // If that happens, do not regroup the exports. 
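// A hypothetical example of the situation described above (operands
// abbreviated, not taken from the patch): after regalloc the scheduler
// may see
//   EXP pos0, v0, ...
//   BUFFER_LOAD_DWORD v0, ...   ; reload of a spilled value into v0
//   EXP param0, v0, ...
// The reload has to stay between the two EXPs, so the subgraph walk below
// finds a non-EXP instruction on the path and the grouping is abandoned.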
+ for (unsigned SUNum : DAG->TopDownIndex2SU) { + const SUnit &SU = DAG->SUnits[SUNum]; + if (SIInstrInfo::isEXP(*SU.getInstr())) { + // Check the EXP can be added to the group safely, + // i.e. without needing any other instruction. + // The EXP is allowed to depend on other EXP + // (they will be in the same group). + for (unsigned j : ExpGroup) { + bool HasSubGraph; + std::vector<int> SubGraph; + // By construction (topological order), if SU and + // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessary + // in the parent graph of SU. +#ifndef NDEBUG + SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], + HasSubGraph); + assert(!HasSubGraph); +#endif + SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, + HasSubGraph); + if (!HasSubGraph) + continue; // No dependencies between each other + + // SubGraph contains all the instructions required + // between EXP SUnits[j] and EXP SU. + for (unsigned k : SubGraph) { + if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr())) + // Instructions other than EXP would be required in the group. + // Abort the grouping. + return; + } + } + + ExpGroup.push_back(SUNum); + } + } + + // The group can be formed. Give the color. + for (unsigned j : ExpGroup) + CurrentColoring[j] = ExportColor; +} + void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) { unsigned DAGSize = DAG->SUnits.size(); std::map<unsigned,unsigned> RealID; @@ -1159,6 +1215,7 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria regroupNoUserInstructions(); colorMergeConstantLoadsNextGroup(); colorMergeIfPossibleNextGroupOnlyForReserved(); + colorExports(); // Put SUs of same color into same block Node2CurrentBlock.resize(DAGSize, -1); @@ -1365,8 +1422,8 @@ void SIScheduleBlockCreator::fillStats() { else { unsigned Depth = 0; for (SIScheduleBlock *Pred : Block->getPreds()) { - if (Depth < Pred->Depth + 1) - Depth = Pred->Depth + 1; + if (Depth < Pred->Depth + Pred->getCost()) + Depth = Pred->Depth + Pred->getCost(); } Block->Depth = Depth; } @@ -1380,7 +1437,7 @@ void SIScheduleBlockCreator::fillStats() { else { unsigned Height = 0; for (const auto &Succ : Block->getSuccs()) - Height = std::min(Height, Succ.first->Height + 1); + Height = std::max(Height, Succ.first->Height + Succ.first->getCost()); Block->Height = Height; } } @@ -1578,7 +1635,7 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs) - dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' '; + dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n'; @@ -1993,9 +2050,9 @@ void SIScheduleDAGMI::schedule() placeDebugValues(); DEBUG({ - unsigned BBNum = begin()->getParent()->getNumber(); - dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n"; - dumpSchedule(); - dbgs() << '\n'; - }); + dbgs() << "*** Final schedule for " + << printMBBReference(*begin()->getParent()) << " ***\n"; + dumpSchedule(); + dbgs() << '\n'; + }); } diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h index 122d0f67ca8c..d824e38504e6 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/lib/Target/AMDGPU/SIMachineScheduler.h @@ -302,6 +302,9 @@ private: // (we'd want these groups be at the end).
void regroupNoUserInstructions(); + // Give Reserved color to export instructions + void colorExports(); + void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant); void topologicalSort(); diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp new file mode 100644 index 000000000000..c73fb10b7ea0 --- /dev/null +++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -0,0 +1,627 @@ +//===- SIMemoryLegalizer.cpp ----------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Memory legalizer - implements memory model. More information can be +/// found here: +/// http://llvm.org/docs/AMDGPUUsage.html#memory-model +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUMachineModuleInfo.h" +#include "AMDGPUSubtarget.h" +#include "SIDefines.h" +#include "SIInstrInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Pass.h" +#include "llvm/Support/AtomicOrdering.h" +#include <cassert> +#include <list> + +using namespace llvm; +using namespace llvm::AMDGPU; + +#define DEBUG_TYPE "si-memory-legalizer" +#define PASS_NAME "SI Memory Legalizer" + +namespace { + +class SIMemOpInfo final { +private: + SyncScope::ID SSID = SyncScope::System; + AtomicOrdering Ordering = AtomicOrdering::NotAtomic; + AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; + bool IsNonTemporal = false; + + SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering) + : SSID(SSID), Ordering(Ordering) {} + + SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering, + AtomicOrdering FailureOrdering, bool IsNonTemporal = false) + : SSID(SSID), Ordering(Ordering), FailureOrdering(FailureOrdering), + IsNonTemporal(IsNonTemporal) {} + + /// \returns Info constructed from \p MI, which has at least machine memory + /// operand. + static Optional<SIMemOpInfo> constructFromMIWithMMO( + const MachineBasicBlock::iterator &MI); + +public: + /// \returns Synchronization scope ID of the machine instruction used to + /// create this SIMemOpInfo. + SyncScope::ID getSSID() const { + return SSID; + } + /// \returns Ordering constraint of the machine instruction used to + /// create this SIMemOpInfo. + AtomicOrdering getOrdering() const { + return Ordering; + } + /// \returns Failure ordering constraint of the machine instruction used to + /// create this SIMemOpInfo. + AtomicOrdering getFailureOrdering() const { + return FailureOrdering; + } + /// \returns True if memory access of the machine instruction used to + /// create this SIMemOpInfo is non-temporal, false otherwise. 
+ bool isNonTemporal() const { + return IsNonTemporal; + } + + /// \returns True if ordering constraint of the machine instruction used to + /// create this SIMemOpInfo is unordered or higher, false otherwise. + bool isAtomic() const { + return Ordering != AtomicOrdering::NotAtomic; + } + + /// \returns Load info if \p MI is a load operation, "None" otherwise. + static Optional<SIMemOpInfo> getLoadInfo( + const MachineBasicBlock::iterator &MI); + /// \returns Store info if \p MI is a store operation, "None" otherwise. + static Optional<SIMemOpInfo> getStoreInfo( + const MachineBasicBlock::iterator &MI); + /// \returns Atomic fence info if \p MI is an atomic fence operation, + /// "None" otherwise. + static Optional<SIMemOpInfo> getAtomicFenceInfo( + const MachineBasicBlock::iterator &MI); + /// \returns Atomic cmpxchg info if \p MI is an atomic cmpxchg operation, + /// "None" otherwise. + static Optional<SIMemOpInfo> getAtomicCmpxchgInfo( + const MachineBasicBlock::iterator &MI); + /// \returns Atomic rmw info if \p MI is an atomic rmw operation, + /// "None" otherwise. + static Optional<SIMemOpInfo> getAtomicRmwInfo( + const MachineBasicBlock::iterator &MI); + + /// \brief Reports unknown synchronization scope used in \p MI to LLVM + /// context. + static void reportUnknownSyncScope( + const MachineBasicBlock::iterator &MI); +}; + +class SIMemoryLegalizer final : public MachineFunctionPass { +private: + /// \brief Machine module info. + const AMDGPUMachineModuleInfo *MMI = nullptr; + + /// \brief Instruction info. + const SIInstrInfo *TII = nullptr; + + /// \brief Immediate for "vmcnt(0)". + unsigned Vmcnt0Immediate = 0; + + /// \brief Opcode for cache invalidation instruction (L1). + unsigned Wbinvl1Opcode = 0; + + /// \brief List of atomic pseudo instructions. + std::list<MachineBasicBlock::iterator> AtomicPseudoMIs; + + /// \brief Sets named bit (BitName) to "true" if present in \p MI. Returns + /// true if \p MI is modified, false otherwise. + template <uint16_t BitName> + bool enableNamedBit(const MachineBasicBlock::iterator &MI) const { + int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName); + if (BitIdx == -1) + return false; + + MachineOperand &Bit = MI->getOperand(BitIdx); + if (Bit.getImm() != 0) + return false; + + Bit.setImm(1); + return true; + } + + /// \brief Sets GLC bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. + bool enableGLCBit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit<AMDGPU::OpName::glc>(MI); + } + + /// \brief Sets SLC bit to "true" if present in \p MI. Returns true if \p MI + /// is modified, false otherwise. + bool enableSLCBit(const MachineBasicBlock::iterator &MI) const { + return enableNamedBit<AMDGPU::OpName::slc>(MI); + } + + /// \brief Inserts "buffer_wbinvl1_vol" instruction \p Before or after \p MI. + /// Always returns true. + bool insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI, + bool Before = true) const; + /// \brief Inserts "s_waitcnt vmcnt(0)" instruction \p Before or after \p MI. + /// Always returns true. + bool insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI, + bool Before = true) const; + + /// \brief Removes all processed atomic pseudo instructions from the current + /// function. Returns true if current function is modified, false otherwise. + bool removeAtomicPseudoMIs(); + + /// \brief Expands load operation \p MI. Returns true if instructions are + /// added/deleted or \p MI is modified, false otherwise. 
+ bool expandLoad(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI); + /// \brief Expands store operation \p MI. Returns true if instructions are + /// added/deleted or \p MI is modified, false otherwise. + bool expandStore(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI); + /// \brief Expands atomic fence operation \p MI. Returns true if + /// instructions are added/deleted or \p MI is modified, false otherwise. + bool expandAtomicFence(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI); + /// \brief Expands atomic cmpxchg operation \p MI. Returns true if + /// instructions are added/deleted or \p MI is modified, false otherwise. + bool expandAtomicCmpxchg(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI); + /// \brief Expands atomic rmw operation \p MI. Returns true if + /// instructions are added/deleted or \p MI is modified, false otherwise. + bool expandAtomicRmw(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI); + +public: + static char ID; + + SIMemoryLegalizer() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { + return PASS_NAME; + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +} // end namespace anonymous + +/* static */ +Optional<SIMemOpInfo> SIMemOpInfo::constructFromMIWithMMO( + const MachineBasicBlock::iterator &MI) { + assert(MI->getNumMemOperands() > 0); + + const MachineFunction *MF = MI->getParent()->getParent(); + const AMDGPUMachineModuleInfo *MMI = + &MF->getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); + + SyncScope::ID SSID = SyncScope::SingleThread; + AtomicOrdering Ordering = AtomicOrdering::NotAtomic; + AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic; + bool IsNonTemporal = true; + + // Validator should check whether or not MMOs cover the entire set of + // locations accessed by the memory instruction. + for (const auto &MMO : MI->memoperands()) { + const auto &IsSyncScopeInclusion = + MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID()); + if (!IsSyncScopeInclusion) { + reportUnknownSyncScope(MI); + return None; + } + + SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID(); + Ordering = + isStrongerThan(Ordering, MMO->getOrdering()) ? + Ordering : MMO->getOrdering(); + FailureOrdering = + isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ? + FailureOrdering : MMO->getFailureOrdering(); + + if (!(MMO->getFlags() & MachineMemOperand::MONonTemporal)) + IsNonTemporal = false; + } + + return SIMemOpInfo(SSID, Ordering, FailureOrdering, IsNonTemporal); +} + +/* static */ +Optional<SIMemOpInfo> SIMemOpInfo::getLoadInfo( + const MachineBasicBlock::iterator &MI) { + assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); + + if (!(MI->mayLoad() && !MI->mayStore())) + return None; + + // Be conservative if there are no memory operands. + if (MI->getNumMemOperands() == 0) + return SIMemOpInfo(SyncScope::System, + AtomicOrdering::SequentiallyConsistent); + + return SIMemOpInfo::constructFromMIWithMMO(MI); +} + +/* static */ +Optional<SIMemOpInfo> SIMemOpInfo::getStoreInfo( + const MachineBasicBlock::iterator &MI) { + assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); + + if (!(!MI->mayLoad() && MI->mayStore())) + return None; + + // Be conservative if there are no memory operands. 
+ if (MI->getNumMemOperands() == 0) + return SIMemOpInfo(SyncScope::System, + AtomicOrdering::SequentiallyConsistent); + + return SIMemOpInfo::constructFromMIWithMMO(MI); +} + +/* static */ +Optional<SIMemOpInfo> SIMemOpInfo::getAtomicFenceInfo( + const MachineBasicBlock::iterator &MI) { + assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); + + if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) + return None; + + SyncScope::ID SSID = + static_cast<SyncScope::ID>(MI->getOperand(1).getImm()); + AtomicOrdering Ordering = + static_cast<AtomicOrdering>(MI->getOperand(0).getImm()); + return SIMemOpInfo(SSID, Ordering); +} + +/* static */ +Optional<SIMemOpInfo> SIMemOpInfo::getAtomicCmpxchgInfo( + const MachineBasicBlock::iterator &MI) { + assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); + + if (!(MI->mayLoad() && MI->mayStore())) + return None; + + // Be conservative if there are no memory operands. + if (MI->getNumMemOperands() == 0) + return SIMemOpInfo(SyncScope::System, + AtomicOrdering::SequentiallyConsistent, + AtomicOrdering::SequentiallyConsistent); + + return SIMemOpInfo::constructFromMIWithMMO(MI); +} + +/* static */ +Optional<SIMemOpInfo> SIMemOpInfo::getAtomicRmwInfo( + const MachineBasicBlock::iterator &MI) { + assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); + + if (!(MI->mayLoad() && MI->mayStore())) + return None; + + // Be conservative if there are no memory operands. + if (MI->getNumMemOperands() == 0) + return SIMemOpInfo(SyncScope::System, + AtomicOrdering::SequentiallyConsistent); + + return SIMemOpInfo::constructFromMIWithMMO(MI); +} + +/* static */ +void SIMemOpInfo::reportUnknownSyncScope( + const MachineBasicBlock::iterator &MI) { + DiagnosticInfoUnsupported Diag(MI->getParent()->getParent()->getFunction(), + "Unsupported synchronization scope"); + LLVMContext *CTX = &MI->getParent()->getParent()->getFunction().getContext(); + CTX->diagnose(Diag); +} + +bool SIMemoryLegalizer::insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI, + bool Before) const { + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (!Before) + ++MI; + + BuildMI(MBB, MI, DL, TII->get(Wbinvl1Opcode)); + + if (!Before) + --MI; + + return true; +} + +bool SIMemoryLegalizer::insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI, + bool Before) const { + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (!Before) + ++MI; + + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Vmcnt0Immediate); + + if (!Before) + --MI; + + return true; +} + +bool SIMemoryLegalizer::removeAtomicPseudoMIs() { + if (AtomicPseudoMIs.empty()) + return false; + + for (auto &MI : AtomicPseudoMIs) + MI->eraseFromParent(); + + AtomicPseudoMIs.clear(); + return true; +} + +bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI) { + assert(MI->mayLoad() && !MI->mayStore()); + + bool Changed = false; + + if (MOI.isAtomic()) { + if (MOI.getSSID() == SyncScope::System || + MOI.getSSID() == MMI->getAgentSSID()) { + if (MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) + Changed |= enableGLCBit(MI); + + if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) + Changed |= insertWaitcntVmcnt0(MI); + + if (MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { + Changed |= insertWaitcntVmcnt0(MI, false); + Changed |= insertBufferWbinvl1Vol(MI, false); + } + + return Changed; + } + + 
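// A rough sketch, not from the patch itself, of the shape the branch above
// gives an agent- or system-scope sequentially consistent atomic load;
// opcodes are abbreviated, and the invalidate is BUFFER_WBINVL1 on SI and
// BUFFER_WBINVL1_VOL on CI and later:
//   s_waitcnt vmcnt(0)       ; drain earlier vmem accesses (release side)
//   flat_load_dword v0, v[2:3] glc
//   s_waitcnt vmcnt(0)       ; wait for the load itself (acquire side)
//   buffer_wbinvl1_vol       ; invalidate L1 so later loads see fresh data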
if (MOI.getSSID() == SyncScope::SingleThread || + MOI.getSSID() == MMI->getWorkgroupSSID() || + MOI.getSSID() == MMI->getWavefrontSSID()) { + return Changed; + } + + llvm_unreachable("Unsupported synchronization scope"); + } + + // Atomic instructions do not have the nontemporal attribute. + if (MOI.isNonTemporal()) { + Changed |= enableGLCBit(MI); + Changed |= enableSLCBit(MI); + return Changed; + } + + return Changed; +} + +bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI) { + assert(!MI->mayLoad() && MI->mayStore()); + + bool Changed = false; + + if (MOI.isAtomic()) { + if (MOI.getSSID() == SyncScope::System || + MOI.getSSID() == MMI->getAgentSSID()) { + if (MOI.getOrdering() == AtomicOrdering::Release || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) + Changed |= insertWaitcntVmcnt0(MI); + + return Changed; + } + + if (MOI.getSSID() == SyncScope::SingleThread || + MOI.getSSID() == MMI->getWorkgroupSSID() || + MOI.getSSID() == MMI->getWavefrontSSID()) { + return Changed; + } + + llvm_unreachable("Unsupported synchronization scope"); + } + + // Atomic instructions do not have the nontemporal attribute. + if (MOI.isNonTemporal()) { + Changed |= enableGLCBit(MI); + Changed |= enableSLCBit(MI); + return Changed; + } + + return Changed; +} + +bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI) { + assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE); + + bool Changed = false; + + if (MOI.isAtomic()) { + if (MOI.getSSID() == SyncScope::System || + MOI.getSSID() == MMI->getAgentSSID()) { + if (MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::Release || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) + Changed |= insertWaitcntVmcnt0(MI); + + if (MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) + Changed |= insertBufferWbinvl1Vol(MI); + + AtomicPseudoMIs.push_back(MI); + return Changed; + } + + if (MOI.getSSID() == SyncScope::SingleThread || + MOI.getSSID() == MMI->getWorkgroupSSID() || + MOI.getSSID() == MMI->getWavefrontSSID()) { + AtomicPseudoMIs.push_back(MI); + return Changed; + } + + SIMemOpInfo::reportUnknownSyncScope(MI); + } + + return Changed; +} + +bool SIMemoryLegalizer::expandAtomicCmpxchg(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI) { + assert(MI->mayLoad() && MI->mayStore()); + + bool Changed = false; + + if (MOI.isAtomic()) { + if (MOI.getSSID() == SyncScope::System || + MOI.getSSID() == MMI->getAgentSSID()) { + if (MOI.getOrdering() == AtomicOrdering::Release || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || + MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) + Changed |= insertWaitcntVmcnt0(MI); + + if (MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent || + MOI.getFailureOrdering() == AtomicOrdering::Acquire || + MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) { + Changed |= insertWaitcntVmcnt0(MI, false); + Changed |= insertBufferWbinvl1Vol(MI, false); + } + + return Changed; + } + + if (MOI.getSSID() == SyncScope::SingleThread || + MOI.getSSID() == MMI->getWorkgroupSSID() || + MOI.getSSID() == 
MMI->getWavefrontSSID()) { + Changed |= enableGLCBit(MI); + return Changed; + } + + llvm_unreachable("Unsupported synchronization scope"); + } + + return Changed; +} + +bool SIMemoryLegalizer::expandAtomicRmw(const SIMemOpInfo &MOI, + MachineBasicBlock::iterator &MI) { + assert(MI->mayLoad() && MI->mayStore()); + + bool Changed = false; + + if (MOI.isAtomic()) { + if (MOI.getSSID() == SyncScope::System || + MOI.getSSID() == MMI->getAgentSSID()) { + if (MOI.getOrdering() == AtomicOrdering::Release || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) + Changed |= insertWaitcntVmcnt0(MI); + + if (MOI.getOrdering() == AtomicOrdering::Acquire || + MOI.getOrdering() == AtomicOrdering::AcquireRelease || + MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) { + Changed |= insertWaitcntVmcnt0(MI, false); + Changed |= insertBufferWbinvl1Vol(MI, false); + } + + return Changed; + } + + if (MOI.getSSID() == SyncScope::SingleThread || + MOI.getSSID() == MMI->getWorkgroupSSID() || + MOI.getSSID() == MMI->getWavefrontSSID()) { + Changed |= enableGLCBit(MI); + return Changed; + } + + llvm_unreachable("Unsupported synchronization scope"); + } + + return Changed; +} + +bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const IsaInfo::IsaVersion IV = IsaInfo::getIsaVersion(ST.getFeatureBits()); + + MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>(); + TII = ST.getInstrInfo(); + + Vmcnt0Immediate = + AMDGPU::encodeWaitcnt(IV, 0, getExpcntBitMask(IV), getLgkmcntBitMask(IV)); + Wbinvl1Opcode = ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS ? + AMDGPU::BUFFER_WBINVL1 : AMDGPU::BUFFER_WBINVL1_VOL; + + for (auto &MBB : MF) { + for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { + if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) + continue; + + if (const auto &MOI = SIMemOpInfo::getLoadInfo(MI)) + Changed |= expandLoad(MOI.getValue(), MI); + else if (const auto &MOI = SIMemOpInfo::getStoreInfo(MI)) + Changed |= expandStore(MOI.getValue(), MI); + else if (const auto &MOI = SIMemOpInfo::getAtomicFenceInfo(MI)) + Changed |= expandAtomicFence(MOI.getValue(), MI); + else if (const auto &MOI = SIMemOpInfo::getAtomicCmpxchgInfo(MI)) + Changed |= expandAtomicCmpxchg(MOI.getValue(), MI); + else if (const auto &MOI = SIMemOpInfo::getAtomicRmwInfo(MI)) + Changed |= expandAtomicRmw(MOI.getValue(), MI); + } + } + + Changed |= removeAtomicPseudoMIs(); + return Changed; +} + +INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false) + +char SIMemoryLegalizer::ID = 0; +char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID; + +FunctionPass *llvm::createSIMemoryLegalizerPass() { + return new SIMemoryLegalizer(); +} diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp index 4d2f917278e9..2dc6f2702b3b 100644 --- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -10,7 +10,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -87,6 +87,30 @@ static unsigned isCopyToExec(const MachineInstr &MI) { return AMDGPU::NoRegister; } +/// If \p MI is a logical operation on an exec value, 
+/// return the register copied to. +static unsigned isLogicalOpOnExec(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AMDGPU::S_AND_B64: + case AMDGPU::S_OR_B64: + case AMDGPU::S_XOR_B64: + case AMDGPU::S_ANDN2_B64: + case AMDGPU::S_ORN2_B64: + case AMDGPU::S_NAND_B64: + case AMDGPU::S_NOR_B64: + case AMDGPU::S_XNOR_B64: { + const MachineOperand &Src1 = MI.getOperand(1); + if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC) + return MI.getOperand(0).getReg(); + const MachineOperand &Src2 = MI.getOperand(2); + if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC) + return MI.getOperand(0).getReg(); + } + } + + return AMDGPU::NoRegister; +} + static unsigned getSaveExecOp(unsigned Opc) { switch (Opc) { case AMDGPU::S_AND_B64: @@ -181,6 +205,9 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) { } bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -209,8 +236,24 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { // Scan backwards to find the def. auto CopyToExecInst = &*I; auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec); - if (CopyFromExecInst == E) + if (CopyFromExecInst == E) { + auto PrepareExecInst = std::next(I); + if (PrepareExecInst == E) + continue; + // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec + if (CopyToExecInst->getOperand(1).isKill() && + isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) { + DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst); + + PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC); + + DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n'); + + CopyToExecInst->eraseFromParent(); + } + continue; + } if (isLiveOut(MBB, CopyToExec)) { // The copied register is live out and has a second use in another block. @@ -233,10 +276,12 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { break; } + bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI); + if (J->modifiesRegister(CopyToExec, TRI)) { if (SaveExecInst) { DEBUG(dbgs() << "Multiple instructions modify " - << PrintReg(CopyToExec, TRI) << '\n'); + << printReg(CopyToExec, TRI) << '\n'); SaveExecInst = nullptr; break; } @@ -245,7 +290,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END) break; - if (J->readsRegister(CopyFromExec, TRI)) { + if (ReadsCopyFromExec) { SaveExecInst = &*J; DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n'); continue; @@ -253,6 +298,18 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "Instruction does not read exec copy: " << *J << '\n'); break; } + } else if (ReadsCopyFromExec && !SaveExecInst) { + // Make sure no other instruction is trying to use this copy, before it + // will be rewritten by the saveexec, i.e. hasOneUse. There may have + // been another use, such as an inserted spill. 
For example: + // + // %sgpr0_sgpr1 = COPY %exec + // spill %sgpr0_sgpr1 + // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1 + // + DEBUG(dbgs() << "Found second use of save inst candidate: " + << *J << '\n'); + break; } if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) { diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp new file mode 100644 index 000000000000..83074773c495 --- /dev/null +++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp @@ -0,0 +1,252 @@ +//===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This pass removes redundant S_OR_B64 instructions that re-enable lanes in +/// the exec mask. If two SI_END_CF (lowered as S_OR_B64) come together without any +/// vector instructions between them, only the outer SI_END_CF needs to be kept: +/// because the CFG is structured, the exec mask restored by the outer end statement +/// always covers at least the lanes restored by the inner one. +/// +/// This needs to be done before RA so that the registers holding the saved exec +/// masks can be eliminated, but after the register coalescer so that no vector +/// register copies remain between the different end-cf statements. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-optimize-exec-masking-pre-ra" + +namespace { + +class SIOptimizeExecMaskingPreRA : public MachineFunctionPass { +public: + static char ID; + +public: + SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) { + initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "SI optimize exec mask operations pre-RA"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LiveIntervals>(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace.
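As a rough illustration of the transformation described in the file header above, the shape the pass looks for and the result are approximately the following (block labels and register names are made up for this sketch, not taken from the patch):

    ; before: inner and outer endif restores back to back, only SALU in between
    bb.1:
      %exec = S_OR_B64 %exec, %inner_saved   ; inner SI_END_CF, already lowered
      S_MOV_B32 ...                          ; scalar-only code that does not read exec
    bb.2:                                    ; single successor, layout successor of bb.1
      %exec = S_OR_B64 %exec, %outer_saved   ; outer SI_END_CF

    ; after: the inner S_OR_B64 is erased (and, when its saved-mask operand comes
    ; from a full COPY of exec whose uses all sit in that copy's own block, the
    ; copy is removed and its uses rewritten to exec); the outer restore re-enables
    ; a superset of the same lanes because the CFG is structured.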
+ +INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingPreRA, DEBUG_TYPE, + "SI optimize exec mask operations pre-RA", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(SIOptimizeExecMaskingPreRA, DEBUG_TYPE, + "SI optimize exec mask operations pre-RA", false, false) + +char SIOptimizeExecMaskingPreRA::ID = 0; + +char &llvm::SIOptimizeExecMaskingPreRAID = SIOptimizeExecMaskingPreRA::ID; + +FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() { + return new SIOptimizeExecMaskingPreRA(); +} + +static bool isEndCF(const MachineInstr& MI, const SIRegisterInfo* TRI) { + return MI.getOpcode() == AMDGPU::S_OR_B64 && + MI.modifiesRegister(AMDGPU::EXEC, TRI); +} + +static bool isFullExecCopy(const MachineInstr& MI) { + return MI.isFullCopy() && MI.getOperand(1).getReg() == AMDGPU::EXEC; +} + +static unsigned getOrNonExecReg(const MachineInstr &MI, + const SIInstrInfo &TII) { + auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1); + if (Op->isReg() && Op->getReg() != AMDGPU::EXEC) + return Op->getReg(); + Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0); + if (Op->isReg() && Op->getReg() != AMDGPU::EXEC) + return Op->getReg(); + return AMDGPU::NoRegister; +} + +static MachineInstr* getOrExecSource(const MachineInstr &MI, + const SIInstrInfo &TII, + const MachineRegisterInfo &MRI) { + auto SavedExec = getOrNonExecReg(MI, TII); + if (SavedExec == AMDGPU::NoRegister) + return nullptr; + auto SaveExecInst = MRI.getUniqueVRegDef(SavedExec); + if (!SaveExecInst || !isFullExecCopy(*SaveExecInst)) + return nullptr; + return SaveExecInst; +} + +bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); + DenseSet<unsigned> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI}); + bool Changed = false; + + for (MachineBasicBlock &MBB : MF) { + + // Try to remove unneeded instructions before s_endpgm. + if (MBB.succ_empty()) { + if (MBB.empty() || MBB.back().getOpcode() != AMDGPU::S_ENDPGM) + continue; + + SmallVector<MachineBasicBlock*, 4> Blocks({&MBB}); + + while (!Blocks.empty()) { + auto CurBB = Blocks.pop_back_val(); + auto I = CurBB->rbegin(), E = CurBB->rend(); + if (I != E) { + if (I->isUnconditionalBranch() || I->getOpcode() == AMDGPU::S_ENDPGM) + ++I; + else if (I->isBranch()) + continue; + } + + while (I != E) { + if (I->isDebugValue()) { + I = std::next(I); + continue; + } + + if (I->mayStore() || I->isBarrier() || I->isCall() || + I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef()) + break; + + DEBUG(dbgs() << "Removing no effect instruction: " << *I << '\n'); + + for (auto &Op : I->operands()) { + if (Op.isReg()) + RecalcRegs.insert(Op.getReg()); + } + + auto Next = std::next(I); + LIS->RemoveMachineInstrFromMaps(*I); + I->eraseFromParent(); + I = Next; + + Changed = true; + } + + if (I != E) + continue; + + // Try to ascend predecessors. + for (auto *Pred : CurBB->predecessors()) { + if (Pred->succ_size() == 1) + Blocks.push_back(Pred); + } + } + continue; + } + + // Try to collapse adjacent endifs. 
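A small worked instance of the S_ENDPGM tail cleanup implemented just above (the specific instructions are illustrative; the actual criterion is the mayStore / isBarrier / isCall / hasUnmodeledSideEffects / hasOrderedMemoryRef check in the loop):

    ; tail block before the sweep
      %0 = V_ADD_F32_e32 %1, %2      ; result never reaches memory or another block
      %exec = S_OR_B64 %exec, %3     ; exec restore that no longer matters here
      S_ENDPGM

    ; after: both instructions are erased; the walk goes backwards from S_ENDPGM,
    ; stops at the first store, barrier, call or other side effect, and queues the
    ; touched registers for live-interval recomputation.

The "Try to collapse adjacent endifs" comment immediately above introduces the second half of the pass; its effect is sketched after the class definition earlier in this file.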
+ auto Lead = MBB.begin(), E = MBB.end(); + if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI)) + continue; + + const MachineBasicBlock* Succ = *MBB.succ_begin(); + if (!MBB.isLayoutSuccessor(Succ)) + continue; + + auto I = std::next(Lead); + + for ( ; I != E; ++I) + if (!TII->isSALU(*I) || I->readsRegister(AMDGPU::EXEC, TRI)) + break; + + if (I != E) + continue; + + const auto NextLead = Succ->begin(); + if (NextLead == Succ->end() || !isEndCF(*NextLead, TRI) || + !getOrExecSource(*NextLead, *TII, MRI)) + continue; + + DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n'); + + auto SaveExec = getOrExecSource(*Lead, *TII, MRI); + unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII); + for (auto &Op : Lead->operands()) { + if (Op.isReg()) + RecalcRegs.insert(Op.getReg()); + } + + LIS->RemoveMachineInstrFromMaps(*Lead); + Lead->eraseFromParent(); + if (SaveExecReg) { + LIS->removeInterval(SaveExecReg); + LIS->createAndComputeVirtRegInterval(SaveExecReg); + } + + Changed = true; + + // If the only use of saved exec in the removed instruction is S_AND_B64 + // fold the copy now. + if (!SaveExec || !SaveExec->isFullCopy()) + continue; + + unsigned SavedExec = SaveExec->getOperand(0).getReg(); + bool SafeToReplace = true; + for (auto& U : MRI.use_nodbg_instructions(SavedExec)) { + if (U.getParent() != SaveExec->getParent()) { + SafeToReplace = false; + break; + } + + DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n'); + } + + if (SafeToReplace) { + LIS->RemoveMachineInstrFromMaps(*SaveExec); + SaveExec->eraseFromParent(); + MRI.replaceRegWith(SavedExec, AMDGPU::EXEC); + LIS->removeInterval(SavedExec); + } + } + + if (Changed) { + for (auto Reg : RecalcRegs) { + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + LIS->removeInterval(Reg); + if (!MRI.reg_empty(Reg)) + LIS->createAndComputeVirtRegInterval(Reg); + } else { + for (MCRegUnitIterator U(Reg, TRI); U.isValid(); ++U) + LIS->removeRegUnit(*U); + } + } + } + + return Changed; +} diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index e2ac6631d2f3..5ed7fdf220bf 100644 --- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -1,4 +1,4 @@ -//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===// +//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===// // // The LLVM Compiler Infrastructure // @@ -10,12 +10,12 @@ /// \file This pass tries to apply several peephole SDWA patterns. /// /// E.g. 
original: -/// V_LSHRREV_B32_e32 %vreg0, 16, %vreg1 -/// V_ADD_I32_e32 %vreg2, %vreg0, %vreg3 -/// V_LSHLREV_B32_e32 %vreg4, 16, %vreg2 +/// V_LSHRREV_B32_e32 %0, 16, %1 +/// V_ADD_I32_e32 %2, %0, %3 +/// V_LSHLREV_B32_e32 %4, 16, %2 /// /// Replace: -/// V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3 +/// V_ADD_I32_sdwa %4, %1, %3 /// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD /// //===----------------------------------------------------------------------===// @@ -24,12 +24,31 @@ #include "AMDGPUSubtarget.h" #include "SIDefines.h" #include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <memory> #include <unordered_map> -#include <unordered_set> using namespace llvm; @@ -42,10 +61,11 @@ STATISTIC(NumSDWAInstructionsPeepholed, namespace { class SDWAOperand; +class SDWADstOperand; class SIPeepholeSDWA : public MachineFunctionPass { public: - typedef SmallVector<SDWAOperand *, 4> SDWAOperandsVector; + using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>; private: MachineRegisterInfo *MRI; @@ -67,6 +87,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineFunction &MF); + std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI); bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const; bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const; @@ -91,7 +112,7 @@ public: assert(Replaced->isReg()); } - virtual ~SDWAOperand() {} + virtual ~SDWAOperand() = default; virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0; virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; @@ -99,9 +120,15 @@ public: MachineOperand *getTargetOperand() const { return Target; } MachineOperand *getReplacedOperand() const { return Replaced; } MachineInstr *getParentInst() const { return Target->getParent(); } + MachineRegisterInfo *getMRI() const { return &getParentInst()->getParent()->getParent()->getRegInfo(); } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + virtual void print(raw_ostream& OS) const = 0; + void dump() const { print(dbgs()); } +#endif }; using namespace AMDGPU::SDWA; @@ -117,11 +144,11 @@ public: SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false, bool Sext_ = false) - : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_), - Neg(Neg_), Sext(Sext_) {} + : SDWAOperand(TargetOp, ReplacedOp), + SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {} - virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; - virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; + MachineInstr 
*potentialToConvert(const SIInstrInfo *TII) override; + bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; SdwaSel getSrcSel() const { return SrcSel; } bool getAbs() const { return Abs; } @@ -130,6 +157,10 @@ public: uint64_t getSrcMods(const SIInstrInfo *TII, const MachineOperand *SrcOp) const; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void print(raw_ostream& OS) const override; +#endif }; class SDWADstOperand : public SDWAOperand { @@ -138,18 +169,42 @@ private: DstUnused DstUn; public: + SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) - : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} + : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} - virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; - virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; + MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; + bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; SdwaSel getDstSel() const { return DstSel; } DstUnused getDstUnused() const { return DstUn; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void print(raw_ostream& OS) const override; +#endif +}; + +class SDWADstPreserveOperand : public SDWADstOperand { +private: + MachineOperand *Preserve; + +public: + SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, + MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD) + : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE), + Preserve(PreserveOp) {} + + bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; + + MachineOperand *getPreservedOperand() const { return Preserve; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void print(raw_ostream& OS) const override; +#endif }; -} // End anonymous namespace. 
+} // end anonymous namespace INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false) @@ -161,8 +216,8 @@ FunctionPass *llvm::createSIPeepholeSDWAPass() { return new SIPeepholeSDWA(); } -#ifndef NDEBUG +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) { switch(Sel) { case BYTE_0: OS << "BYTE_0"; break; @@ -185,19 +240,31 @@ static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) { return OS; } -static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) { - OS << "SDWA src: " << *Src.getTargetOperand() - << " src_sel:" << Src.getSrcSel() - << " abs:" << Src.getAbs() << " neg:" << Src.getNeg() - << " sext:" << Src.getSext() << '\n'; +static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) { + Operand.print(OS); return OS; } -static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) { - OS << "SDWA dst: " << *Dst.getTargetOperand() - << " dst_sel:" << Dst.getDstSel() - << " dst_unused:" << Dst.getDstUnused() << '\n'; - return OS; +LLVM_DUMP_METHOD +void SDWASrcOperand::print(raw_ostream& OS) const { + OS << "SDWA src: " << *getTargetOperand() + << " src_sel:" << getSrcSel() + << " abs:" << getAbs() << " neg:" << getNeg() + << " sext:" << getSext() << '\n'; +} + +LLVM_DUMP_METHOD +void SDWADstOperand::print(raw_ostream& OS) const { + OS << "SDWA dst: " << *getTargetOperand() + << " dst_sel:" << getDstSel() + << " dst_unused:" << getDstUnused() << '\n'; +} + +LLVM_DUMP_METHOD +void SDWADstPreserveOperand::print(raw_ostream& OS) const { + OS << "SDWA preserve dst: " << *getTargetOperand() + << " dst_sel:" << getDstSel() + << " preserve:" << *getPreservedOperand() << '\n'; } #endif @@ -221,23 +288,44 @@ static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { LHS.getSubReg() == RHS.getSubReg(); } -static bool isSubregOf(const MachineOperand &SubReg, - const MachineOperand &SuperReg, - const TargetRegisterInfo *TRI) { +static MachineOperand *findSingleRegUse(const MachineOperand *Reg, + const MachineRegisterInfo *MRI) { + if (!Reg->isReg() || !Reg->isDef()) + return nullptr; - if (!SuperReg.isReg() || !SubReg.isReg()) - return false; + MachineOperand *ResMO = nullptr; + for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) { + // If there exist use of subreg of Reg then return nullptr + if (!isSameReg(UseMO, *Reg)) + return nullptr; - if (isSameReg(SuperReg, SubReg)) - return true; + // Check that there is only one instruction that uses Reg + if (!ResMO) { + ResMO = &UseMO; + } else if (ResMO->getParent() != UseMO.getParent()) { + return nullptr; + } + } - if (SuperReg.getReg() != SubReg.getReg()) - return false; + return ResMO; +} + +static MachineOperand *findSingleRegDef(const MachineOperand *Reg, + const MachineRegisterInfo *MRI) { + if (!Reg->isReg()) + return nullptr; + + MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg()); + if (!DefInstr) + return nullptr; + + for (auto &DefMO : DefInstr->defs()) { + if (DefMO.isReg() && DefMO.getReg() == Reg->getReg()) + return &DefMO; + } - LaneBitmask SuperMask = TRI->getSubRegIndexLaneMask(SuperReg.getSubReg()); - LaneBitmask SubMask = TRI->getSubRegIndexLaneMask(SubReg.getSubReg()); - SuperMask |= ~SubMask; - return SuperMask.all(); + // Ignore implicit defs. 
+ return nullptr; } uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, @@ -268,30 +356,11 @@ uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { // For SDWA src operand potential instruction is one that use register // defined by parent instruction - MachineRegisterInfo *MRI = getMRI(); - MachineOperand *Replaced = getReplacedOperand(); - assert(Replaced->isReg()); - - MachineInstr *PotentialMI = nullptr; - for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) { - // If this is use of another subreg of dst reg then do nothing - if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo())) - continue; + MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI()); + if (!PotentialMO) + return nullptr; - // If there exist use of superreg of dst then we should not combine this - // opernad - if (!isSameReg(PotentialMO, *Replaced)) - return nullptr; - - // Check that PotentialMI is only instruction that uses dst reg - if (PotentialMI == nullptr) { - PotentialMI = PotentialMO.getParent(); - } else if (PotentialMI != PotentialMO.getParent()) { - return nullptr; - } - } - - return PotentialMI; + return PotentialMO->getParent(); } bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { @@ -313,7 +382,7 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && - !isSameReg(*Src, *getReplacedOperand())) { + !isSameReg(*Src, *getReplacedOperand())) { // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to // src2. This is not allowed. return false; @@ -333,29 +402,18 @@ MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { // that this operand uses MachineRegisterInfo *MRI = getMRI(); MachineInstr *ParentMI = getParentInst(); - MachineOperand *Replaced = getReplacedOperand(); - assert(Replaced->isReg()); - for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) { - if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo())) - continue; + MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI); + if (!PotentialMO) + return nullptr; - if (!isSameReg(*Replaced, PotentialMO)) + // Check that ParentMI is the only instruction that uses replaced register + for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) { + if (&UseInst != ParentMI) return nullptr; - - // Check that ParentMI is the only instruction that uses replaced register - for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) { - if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) && - UseMO.getParent() != ParentMI) { - return nullptr; - } - } - - // Due to SSA this should be onle def of replaced register, so return it - return PotentialMO.getParent(); } - return nullptr; + return PotentialMO->getParent(); } bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { @@ -386,13 +444,43 @@ bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { return true; } +bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI, + const SIInstrInfo *TII) { + // MI should be moved right before v_or_b32. + // For this we should clear all kill flags on uses of MI src-operands or else + // we can encounter problem with use of killed operand. 
+ for (MachineOperand &MO : MI.uses()) { + if (!MO.isReg()) + continue; + getMRI()->clearKillFlags(MO.getReg()); + } + + // Move MI before v_or_b32 + auto MBB = MI.getParent(); + MBB->remove(&MI); + MBB->insert(getParentInst(), &MI); + + // Add Implicit use of preserved register + MachineInstrBuilder MIB(*MBB->getParent(), MI); + MIB.addReg(getPreservedOperand()->getReg(), + RegState::ImplicitKill, + getPreservedOperand()->getSubReg()); + + // Tie dst to implicit use + MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst), + MI.getNumOperands() - 1); + + // Convert MI as any other SDWADstOperand and remove v_or_b32 + return SDWADstOperand::convertToSDWA(MI, TII); +} + Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { if (Op.isImm()) { return Op.getImm(); } // If this is not immediate then it can be copy of immediate value, e.g.: - // %vreg1<def> = S_MOV_B32 255; + // %1 = S_MOV_B32 255; if (Op.isReg()) { for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) { if (!isSameReg(Op, Def)) @@ -413,195 +501,316 @@ Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { return None; } -void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - case AMDGPU::V_LSHRREV_B32_e32: - case AMDGPU::V_ASHRREV_I32_e32: - case AMDGPU::V_LSHLREV_B32_e32: - case AMDGPU::V_LSHRREV_B32_e64: - case AMDGPU::V_ASHRREV_I32_e64: - case AMDGPU::V_LSHLREV_B32_e64: { - // from: v_lshrrev_b32_e32 v1, 16/24, v0 - // to SDWA src:v0 src_sel:WORD_1/BYTE_3 - - // from: v_ashrrev_i32_e32 v1, 16/24, v0 - // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1 - - // from: v_lshlrev_b32_e32 v1, 16/24, v0 - // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD - MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); - auto Imm = foldToImm(*Src0); - if (!Imm) - break; - - if (*Imm != 16 && *Imm != 24) - break; - - MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - if (TRI->isPhysicalRegister(Src1->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) - break; - - if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || - Opcode == AMDGPU::V_LSHLREV_B32_e64) { - auto SDWADst = make_unique<SDWADstOperand>( - Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); - DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n'); - SDWAOperands[&MI] = std::move(SDWADst); - ++NumSDWAPatternsFound; - } else { - auto SDWASrc = make_unique<SDWASrcOperand>( - Src1, Dst, *Imm == 16 ? 
WORD_1 : BYTE_3, false, false, - Opcode != AMDGPU::V_LSHRREV_B32_e32 && - Opcode != AMDGPU::V_LSHRREV_B32_e64); - DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); - SDWAOperands[&MI] = std::move(SDWASrc); - ++NumSDWAPatternsFound; - } - break; - } +std::unique_ptr<SDWAOperand> +SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AMDGPU::V_LSHRREV_B32_e32: + case AMDGPU::V_ASHRREV_I32_e32: + case AMDGPU::V_LSHLREV_B32_e32: + case AMDGPU::V_LSHRREV_B32_e64: + case AMDGPU::V_ASHRREV_I32_e64: + case AMDGPU::V_LSHLREV_B32_e64: { + // from: v_lshrrev_b32_e32 v1, 16/24, v0 + // to SDWA src:v0 src_sel:WORD_1/BYTE_3 + + // from: v_ashrrev_i32_e32 v1, 16/24, v0 + // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1 + + // from: v_lshlrev_b32_e32 v1, 16/24, v0 + // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + auto Imm = foldToImm(*Src0); + if (!Imm) + break; + + if (*Imm != 16 && *Imm != 24) + break; + + MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + if (TRI->isPhysicalRegister(Src1->getReg()) || + TRI->isPhysicalRegister(Dst->getReg())) + break; + + if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || + Opcode == AMDGPU::V_LSHLREV_B32_e64) { + return make_unique<SDWADstOperand>( + Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); + } else { + return make_unique<SDWASrcOperand>( + Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false, + Opcode != AMDGPU::V_LSHRREV_B32_e32 && + Opcode != AMDGPU::V_LSHRREV_B32_e64); + } + break; + } - case AMDGPU::V_LSHRREV_B16_e32: - case AMDGPU::V_ASHRREV_I16_e32: - case AMDGPU::V_LSHLREV_B16_e32: - case AMDGPU::V_LSHRREV_B16_e64: - case AMDGPU::V_ASHRREV_I16_e64: - case AMDGPU::V_LSHLREV_B16_e64: { - // from: v_lshrrev_b16_e32 v1, 8, v0 - // to SDWA src:v0 src_sel:BYTE_1 - - // from: v_ashrrev_i16_e32 v1, 8, v0 - // to SDWA src:v0 src_sel:BYTE_1 sext:1 - - // from: v_lshlrev_b16_e32 v1, 8, v0 - // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD - MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); - auto Imm = foldToImm(*Src0); - if (!Imm || *Imm != 8) - break; - - MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - - if (TRI->isPhysicalRegister(Src1->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) - break; - - if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || - Opcode == AMDGPU::V_LSHLREV_B16_e64) { - auto SDWADst = - make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); - DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n'); - SDWAOperands[&MI] = std::move(SDWADst); - ++NumSDWAPatternsFound; - } else { - auto SDWASrc = make_unique<SDWASrcOperand>( - Src1, Dst, BYTE_1, false, false, - Opcode != AMDGPU::V_LSHRREV_B16_e32 && - Opcode != AMDGPU::V_LSHRREV_B16_e64); - DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); - SDWAOperands[&MI] = std::move(SDWASrc); - ++NumSDWAPatternsFound; - } - break; - } + case AMDGPU::V_LSHRREV_B16_e32: + case AMDGPU::V_ASHRREV_I16_e32: + case AMDGPU::V_LSHLREV_B16_e32: + case AMDGPU::V_LSHRREV_B16_e64: + case AMDGPU::V_ASHRREV_I16_e64: + case AMDGPU::V_LSHLREV_B16_e64: { + // from: v_lshrrev_b16_e32 v1, 8, v0 + // to SDWA src:v0 src_sel:BYTE_1 + + // from: v_ashrrev_i16_e32 v1, 8, v0 + // to SDWA src:v0 src_sel:BYTE_1 sext:1 + + // from: 
v_lshlrev_b16_e32 v1, 8, v0 + // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + auto Imm = foldToImm(*Src0); + if (!Imm || *Imm != 8) + break; + + MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + + if (TRI->isPhysicalRegister(Src1->getReg()) || + TRI->isPhysicalRegister(Dst->getReg())) + break; + + if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || + Opcode == AMDGPU::V_LSHLREV_B16_e64) { + return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); + } else { + return make_unique<SDWASrcOperand>( + Src1, Dst, BYTE_1, false, false, + Opcode != AMDGPU::V_LSHRREV_B16_e32 && + Opcode != AMDGPU::V_LSHRREV_B16_e64); + } + break; + } - case AMDGPU::V_BFE_I32: - case AMDGPU::V_BFE_U32: { - // e.g.: - // from: v_bfe_u32 v1, v0, 8, 8 - // to SDWA src:v0 src_sel:BYTE_1 - - // offset | width | src_sel - // ------------------------ - // 0 | 8 | BYTE_0 - // 0 | 16 | WORD_0 - // 0 | 32 | DWORD ? - // 8 | 8 | BYTE_1 - // 16 | 8 | BYTE_2 - // 16 | 16 | WORD_1 - // 24 | 8 | BYTE_3 - - MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - auto Offset = foldToImm(*Src1); - if (!Offset) - break; - - MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); - auto Width = foldToImm(*Src2); - if (!Width) - break; - - SdwaSel SrcSel = DWORD; - - if (*Offset == 0 && *Width == 8) - SrcSel = BYTE_0; - else if (*Offset == 0 && *Width == 16) - SrcSel = WORD_0; - else if (*Offset == 0 && *Width == 32) - SrcSel = DWORD; - else if (*Offset == 8 && *Width == 8) - SrcSel = BYTE_1; - else if (*Offset == 16 && *Width == 8) - SrcSel = BYTE_2; - else if (*Offset == 16 && *Width == 16) - SrcSel = WORD_1; - else if (*Offset == 24 && *Width == 8) - SrcSel = BYTE_3; - else - break; - - MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); - MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - - if (TRI->isPhysicalRegister(Src0->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) - break; - - auto SDWASrc = make_unique<SDWASrcOperand>( - Src0, Dst, SrcSel, false, false, - Opcode == AMDGPU::V_BFE_U32 ? false : true); - DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); - SDWAOperands[&MI] = std::move(SDWASrc); - ++NumSDWAPatternsFound; + case AMDGPU::V_BFE_I32: + case AMDGPU::V_BFE_U32: { + // e.g.: + // from: v_bfe_u32 v1, v0, 8, 8 + // to SDWA src:v0 src_sel:BYTE_1 + + // offset | width | src_sel + // ------------------------ + // 0 | 8 | BYTE_0 + // 0 | 16 | WORD_0 + // 0 | 32 | DWORD ? 
+ // 8 | 8 | BYTE_1 + // 16 | 8 | BYTE_2 + // 16 | 16 | WORD_1 + // 24 | 8 | BYTE_3 + + MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + auto Offset = foldToImm(*Src1); + if (!Offset) + break; + + MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); + auto Width = foldToImm(*Src2); + if (!Width) + break; + + SdwaSel SrcSel = DWORD; + + if (*Offset == 0 && *Width == 8) + SrcSel = BYTE_0; + else if (*Offset == 0 && *Width == 16) + SrcSel = WORD_0; + else if (*Offset == 0 && *Width == 32) + SrcSel = DWORD; + else if (*Offset == 8 && *Width == 8) + SrcSel = BYTE_1; + else if (*Offset == 16 && *Width == 8) + SrcSel = BYTE_2; + else if (*Offset == 16 && *Width == 16) + SrcSel = WORD_1; + else if (*Offset == 24 && *Width == 8) + SrcSel = BYTE_3; + else + break; + + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + + if (TRI->isPhysicalRegister(Src0->getReg()) || + TRI->isPhysicalRegister(Dst->getReg())) + break; + + return make_unique<SDWASrcOperand>( + Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32); + } + + case AMDGPU::V_AND_B32_e32: + case AMDGPU::V_AND_B32_e64: { + // e.g.: + // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0 + // to SDWA src:v0 src_sel:WORD_0/BYTE_0 + + MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + auto ValSrc = Src1; + auto Imm = foldToImm(*Src0); + + if (!Imm) { + Imm = foldToImm(*Src1); + ValSrc = Src0; + } + + if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff)) + break; + + MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + + if (TRI->isPhysicalRegister(Src1->getReg()) || + TRI->isPhysicalRegister(Dst->getReg())) + break; + + return make_unique<SDWASrcOperand>( + ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0); + } + + case AMDGPU::V_OR_B32_e32: + case AMDGPU::V_OR_B32_e64: { + // Patterns for dst_unused:UNUSED_PRESERVE. 
+ // e.g., from: + // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD + // src1_sel:WORD_1 src2_sel:WORD1 + // v_add_f16_e32 v3, v1, v2 + // v_or_b32_e32 v4, v0, v3 + // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3 + + // Check if one of operands of v_or_b32 is SDWA instruction + using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>; + auto CheckOROperandsForSDWA = + [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType { + if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg()) + return CheckRetType(None); + + MachineOperand *Op1Def = findSingleRegDef(Op1, MRI); + if (!Op1Def) + return CheckRetType(None); + + MachineInstr *Op1Inst = Op1Def->getParent(); + if (!TII->isSDWA(*Op1Inst)) + return CheckRetType(None); + + MachineOperand *Op2Def = findSingleRegDef(Op2, MRI); + if (!Op2Def) + return CheckRetType(None); + + return CheckRetType(std::make_pair(Op1Def, Op2Def)); + }; + + MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + assert(OrSDWA && OrOther); + auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther); + if (!Res) { + OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + assert(OrSDWA && OrOther); + Res = CheckOROperandsForSDWA(OrSDWA, OrOther); + if (!Res) break; - } - case AMDGPU::V_AND_B32_e32: - case AMDGPU::V_AND_B32_e64: { - // e.g.: - // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0 - // to SDWA src:v0 src_sel:WORD_0/BYTE_0 - - MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); - MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - auto ValSrc = Src1; - auto Imm = foldToImm(*Src0); - - if (!Imm) { - Imm = foldToImm(*Src1); - ValSrc = Src0; - } - - if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff)) - break; - - MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); - - if (TRI->isPhysicalRegister(Src1->getReg()) || - TRI->isPhysicalRegister(Dst->getReg())) - break; - - auto SDWASrc = make_unique<SDWASrcOperand>( - ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0); - DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n'); - SDWAOperands[&MI] = std::move(SDWASrc); + } + + MachineOperand *OrSDWADef = Res->first; + MachineOperand *OrOtherDef = Res->second; + assert(OrSDWADef && OrOtherDef); + + MachineInstr *SDWAInst = OrSDWADef->getParent(); + MachineInstr *OtherInst = OrOtherDef->getParent(); + + // Check that OtherInstr is actually bitwise compatible with SDWAInst = their + // destination patterns don't overlap. Compatible instruction can be either + // regular instruction with compatible bitness or SDWA instruction with + // correct dst_sel + // SDWAInst | OtherInst bitness / OtherInst dst_sel + // ----------------------------------------------------- + // DWORD | no / no + // WORD_0 | no / BYTE_2/3, WORD_1 + // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0 + // BYTE_0 | no / BYTE_1/2/3, WORD_1 + // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1 + // BYTE_2 | 8/16-bit / BYTE_0/1/3. WORD_0 + // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0 + // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK + // but v_add_f32 is not. + + // TODO: add support for non-SDWA instructions as OtherInst. + // For now this only works with SDWA instructions. 
For regular instructions + // there is no way to determine if instruction write only 8/16/24-bit out of + // full register size and all registers are at min 32-bit wide. + if (!TII->isSDWA(*OtherInst)) + break; + + SdwaSel DstSel = static_cast<SdwaSel>( + TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));; + SdwaSel OtherDstSel = static_cast<SdwaSel>( + TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel)); + + bool DstSelAgree = false; + switch (DstSel) { + case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) || + (OtherDstSel == BYTE_3) || + (OtherDstSel == WORD_1)); + break; + case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) || + (OtherDstSel == BYTE_1) || + (OtherDstSel == WORD_0)); + break; + case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) || + (OtherDstSel == BYTE_2) || + (OtherDstSel == BYTE_3) || + (OtherDstSel == WORD_1)); + break; + case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) || + (OtherDstSel == BYTE_2) || + (OtherDstSel == BYTE_3) || + (OtherDstSel == WORD_1)); + break; + case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) || + (OtherDstSel == BYTE_1) || + (OtherDstSel == BYTE_3) || + (OtherDstSel == WORD_0)); + break; + case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) || + (OtherDstSel == BYTE_1) || + (OtherDstSel == BYTE_2) || + (OtherDstSel == WORD_0)); + break; + default: DstSelAgree = false; + } + + if (!DstSelAgree) + break; + + // Also OtherInst dst_unused should be UNUSED_PAD + DstUnused OtherDstUnused = static_cast<DstUnused>( + TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused)); + if (OtherDstUnused != DstUnused::UNUSED_PAD) + break; + + // Create DstPreserveOperand + MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + assert(OrDst && OrDst->isReg()); + + return make_unique<SDWADstPreserveOperand>( + OrDst, OrSDWADef, OrOtherDef, DstSel); + + } + } + + return std::unique_ptr<SDWAOperand>(nullptr); +} + +void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (auto Operand = matchSDWAOperand(MI)) { + DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n'); + SDWAOperands[&MI] = std::move(Operand); ++NumSDWAPatternsFound; - break; - } } } } @@ -609,12 +818,16 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) { bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const { + // Check if this is already an SDWA instruction + unsigned Opc = MI.getOpcode(); + if (TII->isSDWA(Opc)) + return true; + // Check if this instruction has opcode that supports SDWA - int Opc = MI.getOpcode(); if (AMDGPU::getSDWAOp(Opc) == -1) Opc = AMDGPU::getVOPe32(Opc); - if (Opc == -1 || AMDGPU::getSDWAOp(Opc) == -1) + if (AMDGPU::getSDWAOp(Opc) == -1) return false; if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) @@ -647,9 +860,15 @@ bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI, bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands) { // Convert to sdwa - int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode()); - if (SDWAOpcode == -1) - SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(MI.getOpcode())); + int SDWAOpcode; + unsigned Opcode = MI.getOpcode(); + if (TII->isSDWA(Opcode)) { + SDWAOpcode = Opcode; + } else { + SDWAOpcode = AMDGPU::getSDWAOp(Opcode); + if (SDWAOpcode == -1) + SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode)); + } assert(SDWAOpcode != -1); const MCInstrDesc &SDWADesc = 
TII->get(SDWAOpcode); @@ -725,25 +944,44 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, } } - // Initialize dst_sel if present + // Copy dst_sel if present, initialize otherwise if needed if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) { - SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); + MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); + if (DstSel) { + SDWAInst.add(*DstSel); + } else { + SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); + } } - // Initialize dst_unused if present + // Copy dst_unused if present, initialize otherwise if needed if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) { - SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD); + MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); + if (DstUnused) { + SDWAInst.add(*DstUnused); + } else { + SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD); + } } - // Initialize src0_sel + // Copy src0_sel if present, initialize otherwise assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1); - SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); - + MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); + if (Src0Sel) { + SDWAInst.add(*Src0Sel); + } else { + SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); + } - // Initialize src1_sel if present + // Copy src1_sel if present, initialize otherwise if needed if (Src1) { assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1); - SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); + MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); + if (Src1Sel) { + SDWAInst.add(*Src1Sel); + } else { + SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD); + } } // Apply all sdwa operand pattenrs @@ -782,7 +1020,7 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI, void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const { const MCInstrDesc &Desc = TII->get(MI.getOpcode()); unsigned ConstantBusCount = 0; - for (MachineOperand &Op: MI.explicit_uses()) { + for (MachineOperand &Op : MI.explicit_uses()) { if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg()))) continue; @@ -812,7 +1050,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - if (!ST.hasSDWA()) + if (!ST.hasSDWA() || skipFunction(MF.getFunction())) return false; MRI = &MF.getRegInfo(); @@ -820,27 +1058,35 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) { TII = ST.getInstrInfo(); // Find all SDWA operands in MF. 
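To make the dst_sel compatibility table in matchSDWAOperand above more concrete, here is a worked instance (opcodes and registers are purely illustrative):

    v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD ...   ; writes bytes 2-3 of v0
    v_mul_f16_sdwa v3, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD ...   ; writes bytes 0-1 of v3
    v_or_b32_e32   v4, v0, v3

WORD_1 and WORD_0 touch disjoint bytes, so the v_or_b32 can be folded away: the first instruction is rewritten with dst:v4, dst_unused:UNUSED_PRESERVE and an implicit use of v3 tied to the destination. Had the second instruction used dst_sel:BYTE_3 instead, both results would land in byte 3, DstSelAgree would remain false, and the match would be rejected. (The runOnMachineFunction hunk that follows wraps matching and conversion in a do/while loop so that patterns exposed by earlier conversions can be picked up on the next iteration.)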
- matchSDWAOperands(MF); + bool Changed = false; + bool Ret = false; + do { + matchSDWAOperands(MF); + + for (const auto &OperandPair : SDWAOperands) { + const auto &Operand = OperandPair.second; + MachineInstr *PotentialMI = Operand->potentialToConvert(TII); + if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { + PotentialMatches[PotentialMI].push_back(Operand.get()); + } + } - for (const auto &OperandPair : SDWAOperands) { - const auto &Operand = OperandPair.second; - MachineInstr *PotentialMI = Operand->potentialToConvert(TII); - if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) { - PotentialMatches[PotentialMI].push_back(Operand.get()); + for (auto &PotentialPair : PotentialMatches) { + MachineInstr &PotentialMI = *PotentialPair.first; + convertToSDWA(PotentialMI, PotentialPair.second); } - } - for (auto &PotentialPair : PotentialMatches) { - MachineInstr &PotentialMI = *PotentialPair.first; - convertToSDWA(PotentialMI, PotentialPair.second); - } + PotentialMatches.clear(); + SDWAOperands.clear(); + + Changed = !ConvertedInstructions.empty(); - PotentialMatches.clear(); - SDWAOperands.clear(); + if (Changed) + Ret = true; - bool Ret = !ConvertedInstructions.empty(); - while (!ConvertedInstructions.empty()) - legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); + while (!ConvertedInstructions.empty()) + legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST); + } while (Changed); return Ret; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 4a3fbb4593bb..65cdc13e03cd 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -148,7 +148,6 @@ unsigned SIRegisterInfo::reservedStackPtrOffsetReg( BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); // EXEC_LO and EXEC_HI could be allocated and used as regular register, but // this seems likely to result in bugs, so I'm marking them as reserved. @@ -173,6 +172,8 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); + reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13); + reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15); const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); @@ -237,8 +238,15 @@ bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const return true; } -bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { - return MF.getFrameInfo().hasStackObjects(); +bool SIRegisterInfo::requiresFrameIndexScavenging( + const MachineFunction &MF) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.hasStackObjects()) + return true; + + // May need to deal with callee saved registers. 
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + return !Info->isEntryFunction(); } bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( @@ -429,6 +437,10 @@ static int getOffsetMUBUFStore(unsigned Opc) { return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET; case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN: return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET; + case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN: + return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET; + case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN: + return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET; default: return -1; } @@ -450,6 +462,18 @@ static int getOffsetMUBUFLoad(unsigned Opc) { return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET; case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN: return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET; + case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN: + return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET; + case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN: + return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET; + case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN: + return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET; + case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: + return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET; + case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN: + return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET; + case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN: + return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET; default: return -1; } @@ -472,17 +496,21 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, if (LoadStoreOp == -1) return false; - unsigned Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata)->getReg(); - - BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .addReg(Reg, getDefRegState(!IsStore)) - .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) - .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); + MachineInstrBuilder NewMI = BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) + .add(*Reg) + .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + + const MachineOperand *VDataIn = TII->getNamedOperand(*MI, + AMDGPU::OpName::vdata_in); + if (VDataIn) + NewMI.add(*VDataIn); return true; } @@ -1045,8 +1073,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, .addImm(Log2_32(ST.getWavefrontSize())) .addReg(DiffReg); } else { - unsigned CarryOut - = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); unsigned ScaledReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -1056,8 +1082,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // TODO: Fold if use instruction is another add of a constant. 
if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) { - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) - .addReg(CarryOut, RegState::Define | RegState::Dead) + TII->getAddNoCarry(*MBB, MI, DL, ResultReg) .addImm(Offset) .addReg(ScaledReg, RegState::Kill); } else { @@ -1066,13 +1091,10 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) .addImm(Offset); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg) - .addReg(CarryOut, RegState::Define | RegState::Dead) + TII->getAddNoCarry(*MBB, MI, DL, ResultReg) .addReg(ConstOffsetReg, RegState::Kill) .addReg(ScaledReg, RegState::Kill); } - - MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC); } // Don't introduce an extra copy if we're just materializing in a mov. @@ -1275,8 +1297,7 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( return RC; // We can assume that each lane corresponds to one 32-bit register. - LaneBitmask::Type Mask = getSubRegIndexLaneMask(SubIdx).getAsInteger(); - unsigned Count = countPopulation(Mask); + unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes(); if (isSGPRClass(RC)) { switch (Count) { case 1: @@ -1322,73 +1343,18 @@ bool SIRegisterInfo::shouldRewriteCopySrc( // class. // // e.g. if we have something like - // vreg0 = ... - // vreg1 = ... - // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2 - // vreg3 = COPY vreg2, sub0 + // %0 = ... + // %1 = ... + // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2 + // %3 = COPY %2, sub0 // // We want to look through the COPY to find: - // => vreg3 = COPY vreg0 + // => %3 = COPY %0 // Plain copy. return getCommonSubClass(DefRC, SrcRC) != nullptr; } -// FIXME: Most of these are flexible with HSA and we don't need to reserve them -// as input registers if unused. Whether the dispatch ptr is necessary should be -// easy to detect from used intrinsics. Scratch setup is harder to know. 
-unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, - enum PreloadedValue Value) const { - - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - (void)ST; - switch (Value) { - case SIRegisterInfo::WORKGROUP_ID_X: - assert(MFI->hasWorkGroupIDX()); - return MFI->WorkGroupIDXSystemSGPR; - case SIRegisterInfo::WORKGROUP_ID_Y: - assert(MFI->hasWorkGroupIDY()); - return MFI->WorkGroupIDYSystemSGPR; - case SIRegisterInfo::WORKGROUP_ID_Z: - assert(MFI->hasWorkGroupIDZ()); - return MFI->WorkGroupIDZSystemSGPR; - case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: - return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; - case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: - assert(MFI->hasPrivateSegmentBuffer()); - return MFI->PrivateSegmentBufferUserSGPR; - case SIRegisterInfo::IMPLICIT_BUFFER_PTR: - assert(MFI->hasImplicitBufferPtr()); - return MFI->ImplicitBufferPtrUserSGPR; - case SIRegisterInfo::KERNARG_SEGMENT_PTR: - assert(MFI->hasKernargSegmentPtr()); - return MFI->KernargSegmentPtrUserSGPR; - case SIRegisterInfo::DISPATCH_ID: - assert(MFI->hasDispatchID()); - return MFI->DispatchIDUserSGPR; - case SIRegisterInfo::FLAT_SCRATCH_INIT: - assert(MFI->hasFlatScratchInit()); - return MFI->FlatScratchInitUserSGPR; - case SIRegisterInfo::DISPATCH_PTR: - assert(MFI->hasDispatchPtr()); - return MFI->DispatchPtrUserSGPR; - case SIRegisterInfo::QUEUE_PTR: - assert(MFI->hasQueuePtr()); - return MFI->QueuePtrUserSGPR; - case SIRegisterInfo::WORKITEM_ID_X: - assert(MFI->hasWorkItemIDX()); - return AMDGPU::VGPR0; - case SIRegisterInfo::WORKITEM_ID_Y: - assert(MFI->hasWorkItemIDY()); - return AMDGPU::VGPR1; - case SIRegisterInfo::WORKITEM_ID_Z: - assert(MFI->hasWorkItemIDZ()); - return AMDGPU::VGPR2; - } - llvm_unreachable("unexpected preloaded value type"); -} - /// \brief Returns a register that is not used at any point in the function. /// If all registers are used, then this function will return // AMDGPU::NoRegister. 
@@ -1525,7 +1491,8 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI, unsigned SubReg, const TargetRegisterClass *DstRC, unsigned DstSubReg, - const TargetRegisterClass *NewRC) const { + const TargetRegisterClass *NewRC, + LiveIntervals &LIS) const { unsigned SrcSize = getRegSizeInBits(*SrcRC); unsigned DstSize = getRegSizeInBits(*DstRC); unsigned NewSize = getRegSizeInBits(*NewRC); @@ -1547,7 +1514,7 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(), - *MF.getFunction()); + MF.getFunction()); switch (RC->getID()) { default: return AMDGPURegisterInfo::getRegPressureLimit(RC, MF); diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 600cc886cb59..bf814b6974a8 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -22,6 +22,7 @@ namespace llvm { +class LiveIntervals; class MachineRegisterInfo; class SISubtarget; class SIMachineFunctionInfo; @@ -63,6 +64,7 @@ public: BitVector getReservedRegs(const MachineFunction &MF) const override; const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; @@ -185,31 +187,6 @@ public: OpType <= AMDGPU::OPERAND_SRC_LAST; } - enum PreloadedValue { - // SGPRS: - PRIVATE_SEGMENT_BUFFER = 0, - DISPATCH_PTR = 1, - QUEUE_PTR = 2, - KERNARG_SEGMENT_PTR = 3, - DISPATCH_ID = 4, - FLAT_SCRATCH_INIT = 5, - WORKGROUP_ID_X = 10, - WORKGROUP_ID_Y = 11, - WORKGROUP_ID_Z = 12, - PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, - IMPLICIT_BUFFER_PTR = 15, - - // VGPRS: - FIRST_VGPR_VALUE = 16, - WORKITEM_ID_X = FIRST_VGPR_VALUE, - WORKITEM_ID_Y = 17, - WORKITEM_ID_Z = 18 - }; - - /// \brief Returns the physical register that \p Value is stored in. 
- unsigned getPreloadedValue(const MachineFunction &MF, - enum PreloadedValue Value) const; - unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, const MachineFunction &MF) const; @@ -236,7 +213,8 @@ public: unsigned SubReg, const TargetRegisterClass *DstRC, unsigned DstSubReg, - const TargetRegisterClass *NewRC) const override; + const TargetRegisterClass *NewRC, + LiveIntervals &LIS) const override; unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index d097b78890e3..6b7c3ffb7bb8 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -77,18 +77,11 @@ def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, let HWEncoding = 110; } -def TTMP0 : SIReg <"ttmp0", 112>; -def TTMP1 : SIReg <"ttmp1", 113>; -def TTMP2 : SIReg <"ttmp2", 114>; -def TTMP3 : SIReg <"ttmp3", 115>; -def TTMP4 : SIReg <"ttmp4", 116>; -def TTMP5 : SIReg <"ttmp5", 117>; -def TTMP6 : SIReg <"ttmp6", 118>; -def TTMP7 : SIReg <"ttmp7", 119>; -def TTMP8 : SIReg <"ttmp8", 120>; -def TTMP9 : SIReg <"ttmp9", 121>; -def TTMP10 : SIReg <"ttmp10", 122>; -def TTMP11 : SIReg <"ttmp11", 123>; +foreach Index = 0-15 in { + def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>; + def TTMP#Index#_gfx9 : SIReg<"ttmp"#Index, !add(108, Index)>; + def TTMP#Index : SIReg<"", 0>; +} multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { def _ci : SIReg<n, ci_e>; @@ -192,7 +185,7 @@ def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, // Trap handler TMP 32-bit registers def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, - (add (sequence "TTMP%u", 0, 11))> { + (add (sequence "TTMP%u", 0, 15))> { let isAllocatable = 0; } @@ -208,6 +201,36 @@ def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], (add (decimate (shl TTMP_32, 2), 4)), (add (decimate (shl TTMP_32, 3), 4))]>; +class TmpRegTuples <string tgt, + bit Is64Bit, + int Index0, + int Index1 = !add(Index0, 1), + int Index2 = !add(Index0, !if(Is64Bit, 1, 2)), + int Index3 = !add(Index0, !if(Is64Bit, 1, 3)), + string name = "ttmp["#Index0#":"#Index3#"]", + Register r0 = !cast<Register>("TTMP"#Index0#tgt), + Register r1 = !cast<Register>("TTMP"#Index1#tgt), + Register r2 = !cast<Register>("TTMP"#Index2#tgt), + Register r3 = !cast<Register>("TTMP"#Index3#tgt)> : + RegisterWithSubRegs<name, !if(Is64Bit, [r0, r1], [r0, r1, r2, r3])> { + let SubRegIndices = !if(Is64Bit, [sub0, sub1], [sub0, sub1, sub2, sub3]); + let HWEncoding = r0.HWEncoding; +} + +foreach Index = {0, 2, 4, 6, 8, 10, 12, 14} in { + def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 1, Index>; + def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 1, Index>; +} + +foreach Index = {0, 4, 8, 12} in { + def TTMP#Index#_TTMP#!add(Index,1)# + _TTMP#!add(Index,2)# + _TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi", 0, Index>; + def TTMP#Index#_TTMP#!add(Index,1)# + _TTMP#!add(Index,2)# + _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 0, Index>; +} + // VGPR 32-bit registers // i16/f16 only on VI+ def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, @@ -269,6 +292,18 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, // Register classes used as source and destination //===----------------------------------------------------------------------===// +def Pseudo_SReg_32 : 
RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, + (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> { + let isAllocatable = 0; + let CopyCost = -1; +} + +def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32, + (add PRIVATE_RSRC_REG)> { + let isAllocatable = 0; + let CopyCost = -1; +} + // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, @@ -278,6 +313,11 @@ def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f1 let AllocationPriority = 7; } +def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, + (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> { + let AllocationPriority = 7; +} + def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> { let AllocationPriority = 7; @@ -285,7 +325,7 @@ def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32 // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI)> { + (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> { let AllocationPriority = 7; } @@ -466,6 +506,8 @@ defm SSrc : RegImmOperand<"SReg", "SSrc">; defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ; +def SCSrc_i1 : RegisterOperand<SReg_64_XEXEC>; + //===----------------------------------------------------------------------===// // VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 874fbadca7f3..41f989ad3228 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -286,7 +286,7 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) { } bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; MachineRegisterInfo &MRI = MF.getRegInfo(); diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp index a613a220e29d..53aefe829737 100644 --- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -9,7 +9,7 @@ // /// \file /// \brief This pass adds instructions to enable whole quad mode for pixel -/// shaders. +/// shaders, and whole wavefront mode for all programs. /// /// Whole quad mode is required for derivative computations, but it interferes /// with shader side effects (stores and atomics). This pass is run on the @@ -29,6 +29,13 @@ /// ... /// S_MOV_B64 EXEC, Tmp /// +/// We also compute when a sequence of instructions requires Whole Wavefront +/// Mode (WWM) and insert instructions to save and restore it: +/// +/// S_OR_SAVEEXEC_B64 Tmp, -1 +/// ... 
+/// S_MOV_B64 EXEC, Tmp +/// /// In order to avoid excessive switching during sequences of Exact /// instructions, the pass first analyzes which instructions must be run in WQM /// (aka which instructions produce values that lead to derivative @@ -54,10 +61,11 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/LiveInterval.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -66,13 +74,13 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetRegisterInfo.h" #include <cassert> #include <vector> @@ -84,7 +92,8 @@ namespace { enum { StateWQM = 0x1, - StateExact = 0x2, + StateWWM = 0x2, + StateExact = 0x4, }; struct PrintState { @@ -94,20 +103,28 @@ public: explicit PrintState(int State) : State(State) {} }; +#ifndef NDEBUG static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) { if (PS.State & StateWQM) OS << "WQM"; - if (PS.State & StateExact) { + if (PS.State & StateWWM) { if (PS.State & StateWQM) OS << '|'; + OS << "WWM"; + } + if (PS.State & StateExact) { + if (PS.State & (StateWQM | StateWWM)) + OS << '|'; OS << "Exact"; } return OS; } +#endif struct InstrInfo { char Needs = 0; + char Disabled = 0; char OutNeeds = 0; }; @@ -128,6 +145,7 @@ struct WorkItem { class SIWholeQuadMode : public MachineFunctionPass { private: + CallingConv::ID CallingConv; const SIInstrInfo *TII; const SIRegisterInfo *TRI; MachineRegisterInfo *MRI; @@ -136,12 +154,14 @@ private: DenseMap<const MachineInstr *, InstrInfo> Instructions; DenseMap<MachineBasicBlock *, BlockInfo> Blocks; SmallVector<MachineInstr *, 1> LiveMaskQueries; + SmallVector<MachineInstr *, 4> LowerToCopyInstrs; void printInfo(); void markInstruction(MachineInstr &MI, char Flag, std::vector<WorkItem> &Worklist); - void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist); + void markInstructionUses(const MachineInstr &MI, char Flag, + std::vector<WorkItem> &Worklist); char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist); void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist); void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist); @@ -159,9 +179,14 @@ private: unsigned SaveWQM, unsigned LiveMaskReg); void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SavedWQM); + void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + unsigned SaveOrig); + void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + unsigned SavedOrig); void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry); void lowerLiveMaskQueries(unsigned LiveMaskReg); + void lowerCopyInstrs(); public: static char ID; @@ -196,9 +221,11 @@ FunctionPass *llvm::createSIWholeQuadModePass() { return new SIWholeQuadMode; } -void SIWholeQuadMode::printInfo() { +#ifndef NDEBUG +LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() { for (const auto 
&BII : Blocks) { - dbgs() << "\nBB#" << BII.first->getNumber() << ":\n" + dbgs() << "\n" + << printMBBReference(*BII.first) << ":\n" << " InNeeds = " << PrintState(BII.second.InNeeds) << ", Needs = " << PrintState(BII.second.Needs) << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n"; @@ -213,27 +240,32 @@ void SIWholeQuadMode::printInfo() { } } } +#endif void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag, std::vector<WorkItem> &Worklist) { InstrInfo &II = Instructions[&MI]; - assert(Flag == StateWQM || Flag == StateExact); + assert(!(Flag & StateExact) && Flag != 0); - // Ignore if the instruction is already marked. The typical case is that we - // mark an instruction WQM multiple times, but for atomics it can happen that - // Flag is StateWQM, but Needs is already set to StateExact. In this case, - // letting the atomic run in StateExact is correct as per the relevant specs. - if (II.Needs) + // Remove any disabled states from the flag. The user that required it gets + // an undefined value in the helper lanes. For example, this can happen if + // the result of an atomic is used by instruction that requires WQM, where + // ignoring the request for WQM is correct as per the relevant specs. + Flag &= ~II.Disabled; + + // Ignore if the flag is already encompassed by the existing needs, or we + // just disabled everything. + if ((II.Needs & Flag) == Flag) return; - II.Needs = Flag; + II.Needs |= Flag; Worklist.push_back(&MI); } -/// Mark all instructions defining the uses in \p MI as WQM. -void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI, - std::vector<WorkItem> &Worklist) { +/// Mark all instructions defining the uses in \p MI with \p Flag. +void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, + std::vector<WorkItem> &Worklist) { for (const MachineOperand &Use : MI.uses()) { if (!Use.isReg() || !Use.isUse()) continue; @@ -258,7 +290,7 @@ void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI, if (Value->isPHIDef()) continue; - markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM, + markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag, Worklist); } @@ -266,7 +298,7 @@ void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI, } for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) - markInstruction(DefMI, StateWQM, Worklist); + markInstruction(DefMI, Flag, Worklist); } } @@ -275,27 +307,72 @@ void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI, char SIWholeQuadMode::scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist) { char GlobalFlags = 0; - bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs"); - - for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; + bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs"); + SmallVector<MachineInstr *, 4> SetInactiveInstrs; + + // We need to visit the basic blocks in reverse post-order so that we visit + // defs before uses, in particular so that we don't accidentally mark an + // instruction as needing e.g. WQM before visiting it and realizing it needs + // WQM disabled. 
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); + for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) { + MachineBasicBlock &MBB = **BI; + BlockInfo &BBI = Blocks[&MBB]; for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) { MachineInstr &MI = *II; + InstrInfo &III = Instructions[&MI]; unsigned Opcode = MI.getOpcode(); char Flags = 0; - if (TII->isDS(Opcode)) { + if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) { Flags = StateWQM; } else if (TII->isWQM(Opcode)) { // Sampling instructions don't need to produce results for all pixels // in a quad, they just require all inputs of a quad to have been // computed for derivatives. - markUsesWQM(MI, Worklist); + markInstructionUses(MI, StateWQM, Worklist); GlobalFlags |= StateWQM; continue; + } else if (Opcode == AMDGPU::WQM) { + // The WQM intrinsic requires its output to have all the helper lanes + // correct, so we need it to be in WQM. + Flags = StateWQM; + LowerToCopyInstrs.push_back(&MI); + } else if (Opcode == AMDGPU::WWM) { + // The WWM intrinsic doesn't make the same guarantee, and plus it needs + // to be executed in WQM or Exact so that its copy doesn't clobber + // inactive lanes. + markInstructionUses(MI, StateWWM, Worklist); + GlobalFlags |= StateWWM; + LowerToCopyInstrs.push_back(&MI); + continue; + } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 || + Opcode == AMDGPU::V_SET_INACTIVE_B64) { + III.Disabled = StateWWM; + MachineOperand &Inactive = MI.getOperand(2); + if (Inactive.isReg()) { + if (Inactive.isUndef()) { + LowerToCopyInstrs.push_back(&MI); + } else { + unsigned Reg = Inactive.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + for (MachineInstr &DefMI : MRI->def_instructions(Reg)) + markInstruction(DefMI, StateWWM, Worklist); + } + } + } + SetInactiveInstrs.push_back(&MI); + continue; } else if (TII->isDisableWQM(MI)) { - Flags = StateExact; + BBI.Needs |= StateExact; + if (!(BBI.InNeeds & StateExact)) { + BBI.InNeeds |= StateExact; + Worklist.push_back(&MBB); + } + GlobalFlags |= StateExact; + III.Disabled = StateWQM | StateWWM; + continue; } else { if (Opcode == AMDGPU::SI_PS_LIVE) { LiveMaskQueries.push_back(&MI); @@ -326,6 +403,14 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, } } + // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is + // ever used anywhere in the function. This implements the corresponding + // semantics of @llvm.amdgcn.set.inactive. + if (GlobalFlags & StateWQM) { + for (MachineInstr *MI : SetInactiveInstrs) + markInstruction(*MI, StateWQM, Worklist); + } + return GlobalFlags; } @@ -337,22 +422,24 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, // Control flow-type instructions and stores to temporary memory that are // followed by WQM computations must themselves be in WQM. 
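To make the Needs/Disabled bookkeeping in this hunk concrete, here is a minimal standalone sketch of the marking step (plain C++, independent of the LLVM classes; the three-instruction graph and the mark() helper are illustrative only, not the pass's own code): an instruction only picks up state bits it has not disabled, and is requeued only when new bits actually appear.

  // Illustrative model of markInstruction(): flag bits an instruction has
  // disabled are dropped, and it is requeued only when new bits are added.
  #include <cstdint>
  #include <cstdio>
  #include <vector>

  enum : uint8_t { StateWQM = 0x1, StateWWM = 0x2, StateExact = 0x4 };

  struct Instr {
    uint8_t Needs = 0;      // states this instruction must run in
    uint8_t Disabled = 0;   // states it must never run in
    std::vector<int> Defs;  // instructions defining its inputs
  };

  static void mark(std::vector<Instr> &IS, std::vector<int> &Worklist,
                   int Idx, uint8_t Flag) {
    Flag &= ~IS[Idx].Disabled;          // e.g. an atomic with WQM disabled
    if ((IS[Idx].Needs & Flag) == Flag) // already covered (or nothing left)
      return;
    IS[Idx].Needs |= Flag;
    Worklist.push_back(Idx);
  }

  int main() {
    std::vector<Instr> IS(3);
    IS[1].Defs = {0};           // 1: a sample that needs WQM, reading from 0
    IS[2].Defs = {0};           // 2: an atomic, WQM disabled
    IS[2].Disabled = StateWQM;

    std::vector<int> Worklist;
    mark(IS, Worklist, 1, StateWQM);
    mark(IS, Worklist, 2, StateWQM);    // silently ignored: WQM disabled there

    while (!Worklist.empty()) {         // propagate needs to the defs
      int I = Worklist.back();
      Worklist.pop_back();
      for (int D : IS[I].Defs)
        mark(IS, Worklist, D, IS[I].Needs);
    }
    std::printf("%u %u %u\n", (unsigned)IS[0].Needs, (unsigned)IS[1].Needs,
                (unsigned)IS[2].Needs); // prints: 1 1 0
    return 0;
  }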
- if ((II.OutNeeds & StateWQM) && !II.Needs && + if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) && (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) { Instructions[&MI].Needs = StateWQM; II.Needs = StateWQM; } // Propagate to block level - BI.Needs |= II.Needs; - if ((BI.InNeeds | II.Needs) != BI.InNeeds) { - BI.InNeeds |= II.Needs; - Worklist.push_back(MBB); + if (II.Needs & StateWQM) { + BI.Needs |= StateWQM; + if (!(BI.InNeeds & StateWQM)) { + BI.InNeeds |= StateWQM; + Worklist.push_back(MBB); + } } // Propagate backwards within block if (MachineInstr *PrevMI = MI.getPrevNode()) { - char InNeeds = II.Needs | II.OutNeeds; + char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds; if (!PrevMI->isPHI()) { InstrInfo &PrevII = Instructions[PrevMI]; if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { @@ -363,10 +450,10 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, } // Propagate WQM flag to instruction inputs - assert(II.Needs != (StateWQM | StateExact)); + assert(!(II.Needs & StateExact)); - if (II.Needs == StateWQM) - markUsesWQM(MI, Worklist); + if (II.Needs != 0) + markInstructionUses(MI, II.Needs, Worklist); } void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, @@ -558,6 +645,29 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, LIS->InsertMachineInstrInMaps(*MI); } +void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + unsigned SaveOrig) { + MachineInstr *MI; + + assert(SaveOrig); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64), + SaveOrig) + .addImm(-1); + LIS->InsertMachineInstrInMaps(*MI); +} + +void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + unsigned SavedOrig) { + MachineInstr *MI; + + assert(SavedOrig); + MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC) + .addReg(SavedOrig); + LIS->InsertMachineInstrInMaps(*MI); +} + void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry) { auto BII = Blocks.find(&MBB); @@ -566,45 +676,66 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, const BlockInfo &BI = BII->second; - if (!(BI.InNeeds & StateWQM)) - return; - // This is a non-entry block that is WQM throughout, so no need to do // anything. - if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact) + if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) return; - DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n"); + DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB) << ":\n"); unsigned SavedWQMReg = 0; + unsigned SavedNonWWMReg = 0; bool WQMFromExec = isEntry; - char State = isEntry ? StateExact : StateWQM; + char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM; + char NonWWMState = 0; auto II = MBB.getFirstNonPHI(), IE = MBB.end(); if (isEntry) ++II; // Skip the instruction that saves LiveMask - MachineBasicBlock::iterator First = IE; + // This stores the first instruction where it's safe to switch from WQM to + // Exact or vice versa. + MachineBasicBlock::iterator FirstWQM = IE; + + // This stores the first instruction where it's safe to switch from WWM to + // Exact/WQM or to switch to WWM. It must always be the same as, or after, + // FirstWQM since if it's safe to switch to/from WWM, it must be safe to + // switch to/from WQM as well. 
+ MachineBasicBlock::iterator FirstWWM = IE; for (;;) { MachineBasicBlock::iterator Next = II; - char Needs = 0; + char Needs = StateExact | StateWQM; // WWM is disabled by default char OutNeeds = 0; - if (First == IE) - First = II; + if (FirstWQM == IE) + FirstWQM = II; + + if (FirstWWM == IE) + FirstWWM = II; + // First, figure out the allowed states (Needs) based on the propagated + // flags. if (II != IE) { MachineInstr &MI = *II; if (requiresCorrectState(MI)) { auto III = Instructions.find(&MI); if (III != Instructions.end()) { - Needs = III->second.Needs; + if (III->second.Needs & StateWWM) + Needs = StateWWM; + else if (III->second.Needs & StateWQM) + Needs = StateWQM; + else + Needs &= ~III->second.Disabled; OutNeeds = III->second.OutNeeds; } + } else { + // If the instruction doesn't actually need a correct EXEC, then we can + // safely leave WWM enabled. + Needs = StateExact | StateWQM | StateWWM; } - if (MI.isTerminator() && !Needs && OutNeeds == StateExact) + if (MI.isTerminator() && OutNeeds == StateExact) Needs = StateExact; if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact) @@ -617,20 +748,45 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, Needs = StateWQM; else if (BI.OutNeeds == StateExact) Needs = StateExact; + else + Needs = StateWQM | StateExact; } - if (Needs) { - if (Needs != State) { - MachineBasicBlock::iterator Before = - prepareInsertion(MBB, First, II, Needs == StateWQM, - Needs == StateExact || WQMFromExec); + // Now, transition if necessary. + if (!(Needs & State)) { + MachineBasicBlock::iterator First; + if (State == StateWWM || Needs == StateWWM) { + // We must switch to or from WWM + First = FirstWWM; + } else { + // We only need to switch to/from WQM, so we can use FirstWQM + First = FirstWQM; + } - if (Needs == StateExact) { + MachineBasicBlock::iterator Before = + prepareInsertion(MBB, First, II, Needs == StateWQM, + Needs == StateExact || WQMFromExec); + + if (State == StateWWM) { + assert(SavedNonWWMReg); + fromWWM(MBB, Before, SavedNonWWMReg); + State = NonWWMState; + } + + if (Needs == StateWWM) { + NonWWMState = State; + SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + toWWM(MBB, Before, SavedNonWWMReg); + State = StateWWM; + } else { + if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) { if (!WQMFromExec && (OutNeeds & StateWQM)) SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); toExact(MBB, Before, SavedWQMReg, LiveMaskReg); - } else { + State = StateExact; + } else if (State == StateExact && (Needs & StateWQM) && + !(Needs & StateExact)) { assert(WQMFromExec == (SavedWQMReg == 0)); toWQM(MBB, Before, SavedWQMReg); @@ -639,12 +795,19 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, LIS->createAndComputeVirtRegInterval(SavedWQMReg); SavedWQMReg = 0; } + State = StateWQM; + } else { + // We can get here if we transitioned from WWM to a non-WWM state that + // already matches our needs, but we shouldn't need to do anything. 
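The switching logic in this hunk is hard to follow in diff form, so here is a much-simplified standalone model of just the target-state choice. It ignores insertion points and the saved-EXEC registers that the real processBlock() also tracks, and assumes Needs is the mask of states the next instruction tolerates.

  #include <cassert>
  #include <cstdio>

  enum : unsigned { StateWQM = 0x1, StateWWM = 0x2, StateExact = 0x4 };

  // If the current state is not among the allowed ones, pick the state to
  // switch to: an explicit WWM request wins, then WQM, otherwise Exact.
  static unsigned nextState(unsigned State, unsigned Needs) {
    if (Needs & State)
      return State;        // no switch required
    if (Needs & StateWWM)
      return StateWWM;     // enter WWM (S_OR_SAVEEXEC_B64 tmp, -1)
    if (Needs & StateWQM)
      return StateWQM;     // restore or recompute WQM
    assert(Needs & StateExact);
    return StateExact;     // mask EXEC down to the live lanes
  }

  int main() {
    unsigned State = StateExact; // entry blocks start in Exact
    const unsigned Trace[] = {StateExact | StateWQM, StateWQM, StateWWM,
                              StateExact};
    for (unsigned Needs : Trace) {
      State = nextState(State, Needs);
      std::printf("state = %u\n", State); // prints: 4, 1, 2, 4
    }
    return 0;
  }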
+ assert(Needs & State); } - - State = Needs; } + } - First = IE; + if (Needs != (StateExact | StateWQM | StateWWM)) { + if (Needs != (StateExact | StateWQM)) + FirstWQM = IE; + FirstWWM = IE; } if (II == IE) @@ -666,13 +829,20 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { } } -bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { - if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS) - return false; +void SIWholeQuadMode::lowerCopyInstrs() { + for (MachineInstr *MI : LowerToCopyInstrs) { + for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--) + MI->RemoveOperand(i); + MI->setDesc(TII->get(AMDGPU::COPY)); + } +} +bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { Instructions.clear(); Blocks.clear(); LiveMaskQueries.clear(); + LowerToCopyInstrs.clear(); + CallingConv = MF.getFunction().getCallingConv(); const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); @@ -682,14 +852,13 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LIS = &getAnalysis<LiveIntervals>(); char GlobalFlags = analyzeFunction(MF); + unsigned LiveMaskReg = 0; if (!(GlobalFlags & StateWQM)) { lowerLiveMaskQueries(AMDGPU::EXEC); - return !LiveMaskQueries.empty(); - } - - // Store a copy of the original live mask when required - unsigned LiveMaskReg = 0; - { + if (!(GlobalFlags & StateWWM)) + return !LiveMaskQueries.empty(); + } else { + // Store a copy of the original live mask when required MachineBasicBlock &Entry = MF.front(); MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); @@ -701,13 +870,15 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LIS->InsertMachineInstrInMaps(*MI); } + lowerLiveMaskQueries(LiveMaskReg); + if (GlobalFlags == StateWQM) { // For a shader that needs only WQM, we can just set it once. BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC) .addReg(AMDGPU::EXEC); - lowerLiveMaskQueries(LiveMaskReg); + lowerCopyInstrs(); // EntryMI may become invalid here return true; } @@ -715,7 +886,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { DEBUG(printInfo()); - lowerLiveMaskQueries(LiveMaskReg); + lowerCopyInstrs(); // Handle the general case for (auto BII : Blocks) diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td index 73dd8b7daa4e..8f347986eb8a 100644 --- a/lib/Target/AMDGPU/SMInstructions.td +++ b/lib/Target/AMDGPU/SMInstructions.td @@ -129,11 +129,8 @@ class SM_Time_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo< opName, (outs SReg_64_XEXEC:$sdst), (ins), " $sdst", [(set i64:$sdst, (node))]> { let hasSideEffects = 1; - // FIXME: mayStore = ? is a workaround for tablegen bug for different - // inferred mayStore flags for the instruction pattern vs. standalone - // Pat. Each considers the other contradictory. - let mayStore = ?; - let mayLoad = ?; + let mayStore = 0; + let mayLoad = 1; let has_sbase = 0; let has_offset = 0; } @@ -239,27 +236,24 @@ def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">; def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; -def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">; - -let Predicates = [isGCN] in { multiclass SMRD_Pattern <string Instr, ValueType vt> { // 1. 
IMM offset - def : Pat < + def : GCNPat < (smrd_load (SMRDImm i64:$sbase, i32:$offset)), (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0)) >; // 2. SGPR offset - def : Pat < + def : GCNPat < (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0)) >; } -let Predicates = [isSICI] in { -def : Pat < +let OtherPredicates = [isSICI] in { +def : GCNPat < (i64 (readcyclecounter)), (S_MEMTIME) >; @@ -277,29 +271,27 @@ defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; // 1. Offset as an immediate -def SM_LOAD_PATTERN : Pat < // name this pattern to reuse AddedComplexity on CI +def SM_LOAD_PATTERN : GCNPat < // name this pattern to reuse AddedComplexity on CI (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)), (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0) >; // 2. Offset loaded in an 32bit SGPR -def : Pat < - (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)), +def : GCNPat < + (SIload_constant v4i32:$sbase, i32:$offset), (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0) >; } // End let AddedComplexity = 100 -} // let Predicates = [isGCN] - -let Predicates = [isVI] in { +let OtherPredicates = [isVI] in { -def : Pat < +def : GCNPat < (i64 (readcyclecounter)), (S_MEMREALTIME) >; -} // let Predicates = [isVI] +} // let OtherPredicates = [isVI] //===----------------------------------------------------------------------===// @@ -508,10 +500,10 @@ def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>; let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity in { -class SMRD_Pattern_ci <string Instr, ValueType vt> : Pat < +class SMRD_Pattern_ci <string Instr, ValueType vt> : GCNPat < (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), (vt (!cast<SM_Pseudo>(Instr#"_IMM_ci") $sbase, $offset, 0))> { - let Predicates = [isCIOnly]; + let OtherPredicates = [isCIOnly]; } def : SMRD_Pattern_ci <"S_LOAD_DWORD", i32>; @@ -520,10 +512,10 @@ def : SMRD_Pattern_ci <"S_LOAD_DWORDX4", v4i32>; def : SMRD_Pattern_ci <"S_LOAD_DWORDX8", v8i32>; def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>; -def : Pat < +def : GCNPat < (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)), (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> { - let Predicates = [isCI]; // should this be isCIOnly? + let OtherPredicates = [isCI]; // should this be isCIOnly? 
} } // End let AddedComplexity = SM_LOAD_PATTERN.AddedComplexity diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index ec29a66c8bbb..02a95a4b6f24 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -139,7 +139,9 @@ let Defs = [SCC] in { [(set i64:$sdst, (not i64:$src0))] >; def S_WQM_B32 : SOP1_32 <"s_wqm_b32">; - def S_WQM_B64 : SOP1_64 <"s_wqm_b64">; + def S_WQM_B64 : SOP1_64 <"s_wqm_b64", + [(set i1:$sdst, (int_amdgcn_wqm_vote i1:$src0))] + >; } // End Defs = [SCC] @@ -159,10 +161,11 @@ def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64">; def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">; def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">; +def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">; + def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32", - [(set i32:$sdst, (cttz_zero_undef i32:$src0))] + [(set i32:$sdst, (AMDGPUffbl_b32 i32:$src0))] >; -def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">; def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32", [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))] @@ -391,6 +394,14 @@ def S_XOR_B32 : SOP2_32 <"s_xor_b32", def S_XOR_B64 : SOP2_64 <"s_xor_b64", [(set i64:$sdst, (xor i64:$src0, i64:$src1))] >; + +def S_XNOR_B32 : SOP2_32 <"s_xnor_b32", + [(set i32:$sdst, (not (xor_oneuse i32:$src0, i32:$src1)))] +>; + +def S_XNOR_B64 : SOP2_64 <"s_xnor_b64", + [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))] +>; } // End isCommutable = 1 def S_ANDN2_B32 : SOP2_32 <"s_andn2_b32">; @@ -401,8 +412,6 @@ def S_NAND_B32 : SOP2_32 <"s_nand_b32">; def S_NAND_B64 : SOP2_64 <"s_nand_b64">; def S_NOR_B32 : SOP2_32 <"s_nor_b32">; def S_NOR_B64 : SOP2_64 <"s_nor_b64">; -def S_XNOR_B32 : SOP2_32 <"s_xnor_b32">; -def S_XNOR_B64 : SOP2_64 <"s_xnor_b64">; } // End Defs = [SCC] // Use added complexity so these patterns are preferred to the VALU patterns. @@ -811,8 +820,7 @@ def S_CBRANCH_SCC0 : SOPP < >; def S_CBRANCH_SCC1 : SOPP < 0x00000005, (ins sopp_brtarget:$simm16), - "s_cbranch_scc1 $simm16", - [(si_uniform_br_scc SCC, bb:$simm16)] + "s_cbranch_scc1 $simm16" >; } // End Uses = [SCC] @@ -942,12 +950,10 @@ def S_SET_GPR_IDX_MODE : SOPP<0x1d, (ins GPRIdxMode:$simm16), } } -let Predicates = [isGCN] in { - //===----------------------------------------------------------------------===// // S_GETREG_B32 Intrinsic Pattern. //===----------------------------------------------------------------------===// -def : Pat < +def : GCNPat < (int_amdgcn_s_getreg imm:$simm16), (S_GETREG_B32 (as_i16imm $simm16)) >; @@ -956,25 +962,25 @@ def : Pat < // SOP1 Patterns //===----------------------------------------------------------------------===// -def : Pat < +def : GCNPat < (i64 (ctpop i64:$src)), (i64 (REG_SEQUENCE SReg_64, (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0, (S_MOV_B32 (i32 0)), sub1)) >; -def : Pat < +def : GCNPat < (i32 (smax i32:$x, (i32 (ineg i32:$x)))), (S_ABS_I32 $x) >; -def : Pat < +def : GCNPat < (i16 imm:$imm), (S_MOV_B32 imm:$imm) >; // Same as a 32-bit inreg -def : Pat< +def : GCNPat< (i32 (sext i16:$src)), (S_SEXT_I32_I16 $src) >; @@ -986,7 +992,7 @@ def : Pat< // V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector // case, the sgpr-copies pass will fix this to use the vector version. -def : Pat < +def : GCNPat < (i32 (addc i32:$src0, i32:$src1)), (S_ADD_U32 $src0, $src1) >; @@ -994,20 +1000,20 @@ def : Pat < // FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that // REG_SEQUENCE patterns don't support instructions with multiple // outputs. 
-def : Pat< +def : GCNPat< (i64 (zext i16:$src)), (REG_SEQUENCE SReg_64, (i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0, (S_MOV_B32 (i32 0)), sub1) >; -def : Pat < +def : GCNPat < (i64 (sext i16:$src)), (REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0, (i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1) >; -def : Pat< +def : GCNPat< (i32 (zext i16:$src)), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src) >; @@ -1018,13 +1024,11 @@ def : Pat< // SOPP Patterns //===----------------------------------------------------------------------===// -def : Pat < +def : GCNPat < (int_amdgcn_s_waitcnt i32:$simm16), (S_WAITCNT (as_i16imm $simm16)) >; -} // End isGCN predicate - //===----------------------------------------------------------------------===// // Real target instructions, move this to the appropriate subtarget TD file diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp index 92fb762ebd73..f61e2e413ad4 100644 --- a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp +++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp @@ -31,7 +31,7 @@ Target &llvm::getTheGCNTarget() { /// \brief Extern function to initialize the targets for the AMDGPU backend extern "C" void LLVMInitializeAMDGPUTargetInfo() { RegisterTarget<Triple::r600, false> R600(getTheAMDGPUTarget(), "r600", - "AMD GPUs HD2XXX-HD6XXX"); + "AMD GPUs HD2XXX-HD6XXX", "AMDGPU"); RegisterTarget<Triple::amdgcn, false> GCN(getTheGCNTarget(), "amdgcn", - "AMD GCN GPUs"); + "AMD GCN GPUs", "AMDGPU"); } diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 67ad904ca972..819a7add0be4 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/Module.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -39,7 +40,9 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #define GET_INSTRINFO_NAMED_OPS +#define GET_INSTRMAP_INFO #include "AMDGPUGenInstrInfo.inc" +#undef GET_INSTRMAP_INFO #undef GET_INSTRINFO_NAMED_OPS namespace { @@ -100,15 +103,76 @@ static cl::opt<bool> EnablePackedInlinableLiterals( namespace AMDGPU { +LLVM_READNONE +static inline Channels indexToChannel(unsigned Channel) { + switch (Channel) { + case 1: + return AMDGPU::Channels_1; + case 2: + return AMDGPU::Channels_2; + case 3: + return AMDGPU::Channels_3; + case 4: + return AMDGPU::Channels_4; + default: + llvm_unreachable("invalid MIMG channel"); + } +} + + +// FIXME: Need to handle d16 images correctly. 
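As a worked example of the (i64 (sext i16)) scalar pattern in the SOPInstructions.td hunk above, which lowers to S_SEXT_I32_I16 plus an S_ASHR_I32 by 31, the small standalone sketch below shows why the high word is just the arithmetic shift of the sign-extended low word (illustrative only).

  #include <cstdint>
  #include <cstdio>

  // Mirrors the REG_SEQUENCE: sub0 = sext_i32_i16(src), sub1 = sub0 >> 31.
  static int64_t sext16to64(int16_t Src) {
    int32_t Lo = Src;                      // S_SEXT_I32_I16
    int32_t Hi = Lo >> 31;                 // S_ASHR_I32 (arithmetic shift)
    uint64_t Bits = ((uint64_t)(uint32_t)Hi << 32) | (uint32_t)Lo;
    return (int64_t)Bits;                  // reassembled 64-bit result
  }

  int main() {
    std::printf("%lld %lld\n", (long long)sext16to64(-5),
                (long long)sext16to64(1234)); // prints: -5 1234
    return 0;
  }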
+static unsigned rcToChannels(unsigned RCID) { + switch (RCID) { + case AMDGPU::VGPR_32RegClassID: + return 1; + case AMDGPU::VReg_64RegClassID: + return 2; + case AMDGPU::VReg_96RegClassID: + return 3; + case AMDGPU::VReg_128RegClassID: + return 4; + default: + llvm_unreachable("invalid MIMG register class"); + } +} + +int getMaskedMIMGOp(const MCInstrInfo &MII, unsigned Opc, unsigned NewChannels) { + AMDGPU::Channels Channel = AMDGPU::indexToChannel(NewChannels); + unsigned OrigChannels = rcToChannels(MII.get(Opc).OpInfo[0].RegClass); + if (NewChannels == OrigChannels) + return Opc; + + switch (OrigChannels) { + case 1: + return AMDGPU::getMaskedMIMGOp1(Opc, Channel); + case 2: + return AMDGPU::getMaskedMIMGOp2(Opc, Channel); + case 3: + return AMDGPU::getMaskedMIMGOp3(Opc, Channel); + case 4: + return AMDGPU::getMaskedMIMGOp4(Opc, Channel); + default: + llvm_unreachable("invalid MIMG channel"); + } +} + +// Wrapper for Tablegen'd function. enum Subtarget is not defined in any +// header files, so we need to wrap it in a function that takes unsigned +// instead. +int getMCOpcode(uint16_t Opcode, unsigned Gen) { + return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen)); +} + namespace IsaInfo { IsaVersion getIsaVersion(const FeatureBitset &Features) { - // SI. + // GCN GFX6 (Southern Islands (SI)). if (Features.test(FeatureISAVersion6_0_0)) return {6, 0, 0}; if (Features.test(FeatureISAVersion6_0_1)) return {6, 0, 1}; - // CI. + + // GCN GFX7 (Sea Islands (CI)). if (Features.test(FeatureISAVersion7_0_0)) return {7, 0, 0}; if (Features.test(FeatureISAVersion7_0_1)) @@ -117,8 +181,10 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) { return {7, 0, 2}; if (Features.test(FeatureISAVersion7_0_3)) return {7, 0, 3}; + if (Features.test(FeatureISAVersion7_0_4)) + return {7, 0, 4}; - // VI. + // GCN GFX8 (Volcanic Islands (VI)). if (Features.test(FeatureISAVersion8_0_0)) return {8, 0, 0}; if (Features.test(FeatureISAVersion8_0_1)) @@ -127,26 +193,39 @@ IsaVersion getIsaVersion(const FeatureBitset &Features) { return {8, 0, 2}; if (Features.test(FeatureISAVersion8_0_3)) return {8, 0, 3}; - if (Features.test(FeatureISAVersion8_0_4)) - return {8, 0, 4}; if (Features.test(FeatureISAVersion8_1_0)) return {8, 1, 0}; - // GFX9. + // GCN GFX9. 
if (Features.test(FeatureISAVersion9_0_0)) return {9, 0, 0}; - if (Features.test(FeatureISAVersion9_0_1)) - return {9, 0, 1}; if (Features.test(FeatureISAVersion9_0_2)) return {9, 0, 2}; - if (Features.test(FeatureISAVersion9_0_3)) - return {9, 0, 3}; if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands)) return {0, 0, 0}; return {7, 0, 0}; } +void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) { + auto TargetTriple = STI->getTargetTriple(); + auto ISAVersion = IsaInfo::getIsaVersion(STI->getFeatureBits()); + + Stream << TargetTriple.getArchName() << '-' + << TargetTriple.getVendorName() << '-' + << TargetTriple.getOSName() << '-' + << TargetTriple.getEnvironmentName() << '-' + << "gfx" + << ISAVersion.Major + << ISAVersion.Minor + << ISAVersion.Stepping; + Stream.flush(); +} + +bool hasCodeObjectV3(const FeatureBitset &Features) { + return Features.test(FeatureCodeObjectV3); +} + unsigned getWavefrontSize(const FeatureBitset &Features) { if (Features.test(FeatureWavefrontSize16)) return 16; @@ -337,16 +416,16 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, Header.private_segment_alignment = 4; } -bool isGroupSegment(const GlobalValue *GV, AMDGPUAS AS) { - return GV->getType()->getAddressSpace() == AS.LOCAL_ADDRESS; +bool isGroupSegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; } -bool isGlobalSegment(const GlobalValue *GV, AMDGPUAS AS) { - return GV->getType()->getAddressSpace() == AS.GLOBAL_ADDRESS; +bool isGlobalSegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; } -bool isReadOnlySegment(const GlobalValue *GV, AMDGPUAS AS) { - return GV->getType()->getAddressSpace() == AS.CONSTANT_ADDRESS; +bool isReadOnlySegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; } bool shouldEmitConstantsToTextSection(const Triple &TT) { @@ -486,7 +565,9 @@ unsigned getInitialPSInputAddr(const Function &F) { bool isShader(CallingConv::ID cc) { switch(cc) { case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_LS: case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: @@ -508,7 +589,9 @@ bool isEntryFunctionCC(CallingConv::ID CC) { case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_ES: case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_LS: return true; default: return false; @@ -531,6 +614,10 @@ bool isGFX9(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; } +bool isGCN3Encoding(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]; +} + bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); const unsigned FirstSubReg = TRI->getSubReg(Reg, 1); @@ -545,44 +632,68 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { return false; } -unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { +#define MAP_REG2REG \ + using namespace AMDGPU; \ + switch(Reg) { \ + default: return Reg; \ + CASE_CI_VI(FLAT_SCR) \ + CASE_CI_VI(FLAT_SCR_LO) \ + CASE_CI_VI(FLAT_SCR_HI) \ + CASE_VI_GFX9(TTMP0) \ + CASE_VI_GFX9(TTMP1) \ + CASE_VI_GFX9(TTMP2) \ + CASE_VI_GFX9(TTMP3) \ + CASE_VI_GFX9(TTMP4) \ + CASE_VI_GFX9(TTMP5) \ + CASE_VI_GFX9(TTMP6) \ + CASE_VI_GFX9(TTMP7) \ + CASE_VI_GFX9(TTMP8) \ 
+ CASE_VI_GFX9(TTMP9) \ + CASE_VI_GFX9(TTMP10) \ + CASE_VI_GFX9(TTMP11) \ + CASE_VI_GFX9(TTMP12) \ + CASE_VI_GFX9(TTMP13) \ + CASE_VI_GFX9(TTMP14) \ + CASE_VI_GFX9(TTMP15) \ + CASE_VI_GFX9(TTMP0_TTMP1) \ + CASE_VI_GFX9(TTMP2_TTMP3) \ + CASE_VI_GFX9(TTMP4_TTMP5) \ + CASE_VI_GFX9(TTMP6_TTMP7) \ + CASE_VI_GFX9(TTMP8_TTMP9) \ + CASE_VI_GFX9(TTMP10_TTMP11) \ + CASE_VI_GFX9(TTMP12_TTMP13) \ + CASE_VI_GFX9(TTMP14_TTMP15) \ + CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3) \ + CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \ + CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \ + CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \ + } - switch(Reg) { - default: break; - case AMDGPU::FLAT_SCR: - assert(!isSI(STI)); - return isCI(STI) ? AMDGPU::FLAT_SCR_ci : AMDGPU::FLAT_SCR_vi; +#define CASE_CI_VI(node) \ + assert(!isSI(STI)); \ + case node: return isCI(STI) ? node##_ci : node##_vi; - case AMDGPU::FLAT_SCR_LO: - assert(!isSI(STI)); - return isCI(STI) ? AMDGPU::FLAT_SCR_LO_ci : AMDGPU::FLAT_SCR_LO_vi; +#define CASE_VI_GFX9(node) \ + case node: return isGFX9(STI) ? node##_gfx9 : node##_vi; - case AMDGPU::FLAT_SCR_HI: - assert(!isSI(STI)); - return isCI(STI) ? AMDGPU::FLAT_SCR_HI_ci : AMDGPU::FLAT_SCR_HI_vi; - } - return Reg; +unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { + MAP_REG2REG } -unsigned mc2PseudoReg(unsigned Reg) { - switch (Reg) { - case AMDGPU::FLAT_SCR_ci: - case AMDGPU::FLAT_SCR_vi: - return FLAT_SCR; +#undef CASE_CI_VI +#undef CASE_VI_GFX9 - case AMDGPU::FLAT_SCR_LO_ci: - case AMDGPU::FLAT_SCR_LO_vi: - return AMDGPU::FLAT_SCR_LO; +#define CASE_CI_VI(node) case node##_ci: case node##_vi: return node; +#define CASE_VI_GFX9(node) case node##_vi: case node##_gfx9: return node; - case AMDGPU::FLAT_SCR_HI_ci: - case AMDGPU::FLAT_SCR_HI_vi: - return AMDGPU::FLAT_SCR_HI; - - default: - return Reg; - } +unsigned mc2PseudoReg(unsigned Reg) { + MAP_REG2REG } +#undef CASE_CI_VI +#undef CASE_VI_GFX9 +#undef MAP_REG2REG + bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) { assert(OpNo < Desc.NumOperands); unsigned OpType = Desc.OpInfo[OpNo].OperandType; @@ -730,59 +841,66 @@ bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); } +bool isArgPassedInSGPR(const Argument *A) { + const Function *F = A->getParent(); + + // Arguments to compute shaders are never a source of divergence. + CallingConv::ID CC = F->getCallingConv(); + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + return true; + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_LS: + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + // For non-compute shaders, SGPR inputs are marked with either inreg or byval. + // Everything else is in VGPRs. + return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) || + F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal); + default: + // TODO: Should calls support inreg for SGPR inputs? + return false; + } +} + +// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence. bool isUniformMMO(const MachineMemOperand *MMO) { const Value *Ptr = MMO->getValue(); // UndefValue means this is a load of a kernel input. These are uniform. // Sometimes LDS instructions have constant pointers. // If Ptr is null, then that means this mem operand contains a // PseudoSourceValue like GOT. 
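The CASE_VI_GFX9 table above resolves the generic TTMP pseudo-registers to the per-target definitions added in SIRegisterInfo.td, where ttmpN is encoded as 112+N on VI and 108+N on GFX9. A tiny illustrative sketch of that mapping (the Gen enum here is made up for the example):

  #include <cstdio>

  enum class Gen { VI, GFX9 };

  static unsigned ttmpHWEncoding(unsigned Index, Gen G) {
    return (G == Gen::GFX9 ? 108u : 112u) + Index; // per the .td foreach
  }

  int main() {
    std::printf("ttmp7:  VI=%u GFX9=%u\n",
                ttmpHWEncoding(7, Gen::VI), ttmpHWEncoding(7, Gen::GFX9));
    // ttmp7:  VI=119 GFX9=115
    std::printf("ttmp15: GFX9=%u\n", ttmpHWEncoding(15, Gen::GFX9)); // 123
    return 0;
  }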
- if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || + if (!Ptr || isa<UndefValue>(Ptr) || isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) return true; + if (const Argument *Arg = dyn_cast<Argument>(Ptr)) + return isArgPassedInSGPR(Arg); + const Instruction *I = dyn_cast<Instruction>(Ptr); return I && I->getMetadata("amdgpu.uniform"); } int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { - if (isSI(ST) || isCI(ST)) - return ByteOffset >> 2; - - return ByteOffset; + if (isGCN3Encoding(ST)) + return ByteOffset; + return ByteOffset >> 2; } bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) { int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset); - return isSI(ST) || isCI(ST) ? isUInt<8>(EncodedOffset) : - isUInt<20>(EncodedOffset); + return isGCN3Encoding(ST) ? + isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset); } + } // end namespace AMDGPU } // end namespace llvm -const unsigned AMDGPUAS::MAX_COMMON_ADDRESS; -const unsigned AMDGPUAS::GLOBAL_ADDRESS; -const unsigned AMDGPUAS::LOCAL_ADDRESS; -const unsigned AMDGPUAS::PARAM_D_ADDRESS; -const unsigned AMDGPUAS::PARAM_I_ADDRESS; -const unsigned AMDGPUAS::CONSTANT_BUFFER_0; -const unsigned AMDGPUAS::CONSTANT_BUFFER_1; -const unsigned AMDGPUAS::CONSTANT_BUFFER_2; -const unsigned AMDGPUAS::CONSTANT_BUFFER_3; -const unsigned AMDGPUAS::CONSTANT_BUFFER_4; -const unsigned AMDGPUAS::CONSTANT_BUFFER_5; -const unsigned AMDGPUAS::CONSTANT_BUFFER_6; -const unsigned AMDGPUAS::CONSTANT_BUFFER_7; -const unsigned AMDGPUAS::CONSTANT_BUFFER_8; -const unsigned AMDGPUAS::CONSTANT_BUFFER_9; -const unsigned AMDGPUAS::CONSTANT_BUFFER_10; -const unsigned AMDGPUAS::CONSTANT_BUFFER_11; -const unsigned AMDGPUAS::CONSTANT_BUFFER_12; -const unsigned AMDGPUAS::CONSTANT_BUFFER_13; -const unsigned AMDGPUAS::CONSTANT_BUFFER_14; -const unsigned AMDGPUAS::CONSTANT_BUFFER_15; -const unsigned AMDGPUAS::UNKNOWN_ADDRESS_SPACE; - namespace llvm { namespace AMDGPU { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 936e4921a709..a215b445378e 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -19,10 +19,12 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include <cstdint> +#include <string> #include <utility> namespace llvm { +class Argument; class FeatureBitset; class Function; class GlobalValue; @@ -53,6 +55,13 @@ struct IsaVersion { /// \returns Isa version for given subtarget \p Features. IsaVersion getIsaVersion(const FeatureBitset &Features); +/// \brief Streams isa version string for given subtarget \p STI into \p Stream. +void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream); + +/// \returns True if given subtarget \p Features support code object version 3, +/// false otherwise. +bool hasCodeObjectV3(const FeatureBitset &Features); + /// \returns Wavefront size for given subtarget \p Features. 
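The getSMRDEncodedOffset/isLegalSMRDImmOffset change above keys the check on the GCN3 encoding rather than on SI/CI: pre-GCN3 targets encode the SMRD offset in dwords with an 8-bit field, while the GCN3 encoding takes a byte offset with a 20-bit field. A small standalone check illustrating the same rule (names are illustrative, not the real helpers):

  #include <cstdint>
  #include <cstdio>

  static bool legalSMRDImmOffset(bool IsGCN3Encoding, int64_t ByteOffset) {
    if (IsGCN3Encoding)
      return ByteOffset >= 0 && ByteOffset < (1 << 20); // byte offset, 20 bits
    int64_t DWordOffset = ByteOffset >> 2;              // dword offset, 8 bits
    return DWordOffset >= 0 && DWordOffset < (1 << 8);
  }

  int main() {
    std::printf("%d %d %d\n",
                legalSMRDImmOffset(false, 1020),  // 1: dword 255 fits
                legalSMRDImmOffset(false, 1024),  // 0: dword 256 does not
                legalSMRDImmOffset(true, 1024));  // 1: fits in 20 bits
    return 0;
  }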
unsigned getWavefrontSize(const FeatureBitset &Features); @@ -147,12 +156,18 @@ unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU); LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx); +LLVM_READONLY +int getMaskedMIMGOp(const MCInstrInfo &MII, + unsigned Opc, unsigned NewChannels); +LLVM_READONLY +int getMCOpcode(uint16_t Opcode, unsigned Gen); + void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const FeatureBitset &Features); -bool isGroupSegment(const GlobalValue *GV, AMDGPUAS AS); -bool isGlobalSegment(const GlobalValue *GV, AMDGPUAS AS); -bool isReadOnlySegment(const GlobalValue *GV, AMDGPUAS AS); +bool isGroupSegment(const GlobalValue *GV); +bool isGlobalSegment(const GlobalValue *GV); +bool isReadOnlySegment(const GlobalValue *GV); /// \returns True if constants should be emitted to .text section for given /// target triple \p TT, false otherwise. @@ -347,6 +362,7 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi); LLVM_READNONE bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); +bool isArgPassedInSGPR(const Argument *Arg); bool isUniformMMO(const MachineMemOperand *MMO); /// \returns The encoding that will be used for \p ByteOffset in the SMRD diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp index 0333b0a14d29..20059f4a1ed7 100644 --- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp @@ -1,4 +1,4 @@ -//===--------------------AMDKernelCodeTUtils.cpp --------------------------===// +//===- AMDKernelCodeTUtils.cpp --------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -7,17 +7,21 @@ // //===----------------------------------------------------------------------===// // -//===----------------------------------------------------------------------===// -// /// \file - utility functions to parse/print amd_kernel_code_t structure // //===----------------------------------------------------------------------===// #include "AMDKernelCodeTUtils.h" #include "SIDefines.h" -#include <llvm/MC/MCParser/MCAsmLexer.h> -#include <llvm/MC/MCParser/MCAsmParser.h> -#include <llvm/Support/raw_ostream.h> +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cstdint> +#include <utility> using namespace llvm; @@ -62,7 +66,6 @@ static StringRef get_amd_kernel_code_t_FieldName(int index) { return get_amd_kernel_code_t_FldNames()[index + 1]; } - // Field printing static raw_ostream &printName(raw_ostream &OS, StringRef Name) { @@ -82,9 +85,7 @@ static void printBitField(StringRef Name, const amd_kernel_code_t &c, printName(OS, Name) << (int)((c.*ptr >> shift) & Mask); } -typedef void(*PrintFx)(StringRef, - const amd_kernel_code_t &, - raw_ostream &); +using PrintFx = void(*)(StringRef, const amd_kernel_code_t &, raw_ostream &); static ArrayRef<PrintFx> getPrinterTable() { static const PrintFx Table[] = { @@ -114,7 +115,6 @@ void llvm::dumpAmdKernelCode(const amd_kernel_code_t *C, } } - // Field parsing static bool expectAbsExpression(MCAsmParser &MCParser, int64_t &Value, raw_ostream& Err) { @@ -154,9 +154,8 @@ static bool parseBitField(amd_kernel_code_t &C, MCAsmParser &MCParser, return true; } -typedef bool(*ParseFx)(amd_kernel_code_t &, - MCAsmParser &MCParser, - raw_ostream &Err); 
+using ParseFx = bool(*)(amd_kernel_code_t &, MCAsmParser &MCParser, + raw_ostream &Err); static ArrayRef<ParseFx> getParserTable() { static const ParseFx Table[] = { diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h index d9edca7a82ac..ef9f9bdb6bcb 100644 --- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h @@ -1,4 +1,4 @@ -//===- AMDGPUKernelCodeTUtils.h - helpers for amd_kernel_code_t *- C++ -*-===// +//===- AMDGPUKernelCodeTUtils.h - helpers for amd_kernel_code_t -*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -6,34 +6,31 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +// /// \file AMDKernelCodeTUtils.h +// //===----------------------------------------------------------------------===// -#ifndef AMDKERNELCODETUTILS_H -#define AMDKERNELCODETUTILS_H +#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H #include "AMDKernelCodeT.h" namespace llvm { -class MCAsmLexer; class MCAsmParser; class raw_ostream; class StringRef; -void printAmdKernelCodeField(const amd_kernel_code_t &C, - int FldIndex, - raw_ostream &OS); +void printAmdKernelCodeField(const amd_kernel_code_t &C, int FldIndex, + raw_ostream &OS); -void dumpAmdKernelCode(const amd_kernel_code_t *C, - raw_ostream &OS, - const char *tab); +void dumpAmdKernelCode(const amd_kernel_code_t *C, raw_ostream &OS, + const char *tab); -bool parseAmdKernelCodeField(StringRef ID, - MCAsmParser &Parser, - amd_kernel_code_t &C, - raw_ostream &Err); +bool parseAmdKernelCodeField(StringRef ID, MCAsmParser &Parser, + amd_kernel_code_t &C, raw_ostream &Err); -} +} // end namespace llvm -#endif // AMDKERNELCODETUTILS_H +#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index 96b33c373f05..ff2bd2454400 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -266,7 +266,8 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let Outs = (outs); let Ins32 = (ins Src0RC32:$vdst, VSrc_b32:$src0); let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0); - let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + let InsDPP = (ins DstRC:$vdst, DstRC:$old, Src0RC32:$src0, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, @@ -274,7 +275,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { src0_sel:$src0_sel); let Asm32 = getAsm32<1, 1>.ret; - let Asm64 = getAsm64<1, 1, 0, 1>.ret; + let Asm64 = getAsm64<1, 1, 0, 0, 1>.ret; let AsmDPP = getAsmDPP<1, 1, 0>.ret; let AsmSDWA = getAsmSDWA<1, 1>.ret; let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret; @@ -360,14 +361,14 @@ defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; } -let Predicates = [Has16BitInsts] in { +let OtherPredicates = [Has16BitInsts] in { -def : Pat< +def : GCNPat< (f32 (f16_to_fp i16:$src)), (V_CVT_F32_F16_e32 $src) >; -def : Pat< +def : GCNPat< (i16 (AMDGPUfp_to_f16 f32:$src)), (V_CVT_F16_F32_e32 $src) >; @@ -504,8 +505,6 @@ class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> : let Uses = ps.Uses; let SchedRW = ps.SchedRW; let hasSideEffects = ps.hasSideEffects; - let Constraints = ps.Constraints; - let 
DisableEncoding = ps.DisableEncoding; bits<8> vdst; let Inst{8-0} = 0xfa; // dpp @@ -654,36 +653,44 @@ def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>; def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>; def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>; -let Predicates = [isVI] in { +let OtherPredicates = [isVI] in { -def : Pat < +def : GCNPat < (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, imm:$bound_ctrl)), - (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), - (as_i32imm $bank_mask), (as_i1imm $bound_ctrl)) + (V_MOV_B32_dpp $src, $src, (as_i32imm $dpp_ctrl), + (as_i32imm $row_mask), (as_i32imm $bank_mask), + (as_i1imm $bound_ctrl)) >; +def : GCNPat < + (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask, + imm:$bank_mask, imm:$bound_ctrl)), + (V_MOV_B32_dpp $old, $src, (as_i32imm $dpp_ctrl), + (as_i32imm $row_mask), (as_i32imm $bank_mask), + (as_i1imm $bound_ctrl)) +>; -def : Pat< +def : GCNPat< (i32 (anyext i16:$src)), (COPY $src) >; -def : Pat< +def : GCNPat< (i64 (anyext i16:$src)), (REG_SEQUENCE VReg_64, (i32 (COPY $src)), sub0, (V_MOV_B32_e32 (i32 0)), sub1) >; -def : Pat< +def : GCNPat< (i16 (trunc i32:$src)), (COPY $src) >; -def : Pat < +def : GCNPat < (i16 (trunc i64:$src)), (EXTRACT_SUBREG $src, sub0) >; -} // End Predicates = [isVI] +} // End OtherPredicates = [isVI] diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index d5acb49b4f39..ef90b68db1a8 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -128,35 +128,42 @@ class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { multiclass VOP2Inst <string opName, VOPProfile P, SDPatternOperator node = null_frag, - string revOp = opName> { + string revOp = opName, + bit GFX9Renamed = 0> { - def _e32 : VOP2_Pseudo <opName, P>, - Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; + let renamedInGFX9 = GFX9Renamed in { - def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, - Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; + def _e32 : VOP2_Pseudo <opName, P>, + Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; - def _sdwa : VOP2_SDWA_Pseudo <opName, P>; + def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, + Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; + + def _sdwa : VOP2_SDWA_Pseudo <opName, P>; + + } } multiclass VOP2bInst <string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName, + bit GFX9Renamed = 0, bit useSGPRInput = !eq(P.NumSrcArgs, 3)> { - - let SchedRW = [Write32Bit, WriteSALU] in { - let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in { - def _e32 : VOP2_Pseudo <opName, P>, - Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; - - def _sdwa : VOP2_SDWA_Pseudo <opName, P> { - let AsmMatchConverter = "cvtSdwaVOP2b"; + let renamedInGFX9 = GFX9Renamed in { + let SchedRW = [Write32Bit, WriteSALU] in { + let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in { + def _e32 : VOP2_Pseudo <opName, P>, + Commutable_REV<revOp#"_e32", !eq(revOp, opName)>; + + def _sdwa : VOP2_SDWA_Pseudo <opName, P> { + let AsmMatchConverter = "cvtSdwaVOP2b"; + } } - } - def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, - Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; + def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>, + Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; + } } } @@ -208,10 +215,10 @@ def VOP_MADMK_F32 : VOP_MADMK <f32>; class VOP_MAC 
<ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, - HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; - let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, + 0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; + let InsDPP = (ins DstRCDPP:$old, + Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, - VGPR_32:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); @@ -222,7 +229,7 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); let Asm32 = getAsm32<1, 2, vt>.ret; - let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret; + let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, vt>.ret; let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; let AsmSDWA = getAsmSDWA<1, 2, vt>.ret; let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret; @@ -235,13 +242,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> { def VOP_MAC_F16 : VOP_MAC <f16> { // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives // 'not a string initializer' error. - let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f16>.ret; + let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, f16>.ret; } def VOP_MAC_F32 : VOP_MAC <f32> { // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives // 'not a string initializer' error. - let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f32>.ret; + let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, f32>.ret; } // Write out to vcc or arbitrary SGPR. @@ -278,12 +285,13 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, - clampmod:$clamp, omod:$omod, + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); - let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0, - Src1Mod:$src1_modifiers, Src1DPP:$src1, + let InsDPP = (ins DstRCDPP:$old, + Src0DPP:$src0, + Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let HasExt = 1; @@ -369,12 +377,20 @@ def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">; // V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI, // but the VI instructions behave the same as the SI versions. 
-defm V_ADD_I32 : VOP2bInst <"v_add_i32", VOP2b_I32_I1_I32_I32>; -defm V_SUB_I32 : VOP2bInst <"v_sub_i32", VOP2b_I32_I1_I32_I32>; -defm V_SUBREV_I32 : VOP2bInst <"v_subrev_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32">; -defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1>; -defm V_SUBB_U32 : VOP2bInst <"v_subb_u32", VOP2b_I32_I1_I32_I32_I1>; -defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32">; +defm V_ADD_I32 : VOP2bInst <"v_add_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_add_i32", 1>; +defm V_SUB_I32 : VOP2bInst <"v_sub_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32", 1>; +defm V_SUBREV_I32 : VOP2bInst <"v_subrev_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32", 1>; +defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32", 1>; +defm V_SUBB_U32 : VOP2bInst <"v_subb_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32", 1>; +defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32", 1>; + + +let SubtargetPredicate = HasAddNoCarryInsts in { +defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32, null_frag, "v_add_u32", 1>; +defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>; +defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>; +} + } // End isCommutable = 1 // These are special and do not read the exec mask. @@ -399,12 +415,12 @@ defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32 } // End SubtargetPredicate = isGCN -def : Pat< +def : GCNPat< (AMDGPUadde i32:$src0, i32:$src1, i1:$src2), (V_ADDC_U32_e64 $src0, $src1, $src2) >; -def : Pat< +def : GCNPat< (AMDGPUsube i32:$src0, i32:$src1, i1:$src2), (V_SUBB_U32_e64 $src0, $src1, $src2) >; @@ -460,17 +476,17 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; // Note: 16-bit instructions produce a 0 result in the high 16-bits. 
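The carry-producing and carry-consuming forms in this hunk (v_add_i32/v_addc_u32, renamed v_add_co_u32/v_addc_co_u32 on GFX9, with the carry in VCC) exist to build wide integer arithmetic out of 32-bit pieces, which is what the AMDGPUadde/AMDGPUsube patterns above select. A small worked example of that carry chain in plain C++ (illustrative only, not target code):

  #include <cstdint>
  #include <cstdio>

  static uint64_t add64(uint32_t ALo, uint32_t AHi, uint32_t BLo, uint32_t BHi) {
    uint32_t Lo = ALo + BLo;             // v_add_co_u32:  produces carry-out
    uint32_t Carry = Lo < ALo ? 1u : 0u; //                (in VCC)
    uint32_t Hi = AHi + BHi + Carry;     // v_addc_co_u32: consumes carry-in
    return ((uint64_t)Hi << 32) | Lo;
  }

  int main() {
    uint64_t R = add64(0xffffffffu, 0x1u, 0x1u, 0x0u); // 0x1ffffffff + 1
    std::printf("0x%llx\n", (unsigned long long)R);    // prints: 0x200000000
    return 0;
  }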
multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> { -def : Pat< +def : GCNPat< (op i16:$src0, i16:$src1), (inst $src0, $src1) >; -def : Pat< +def : GCNPat< (i32 (zext (op i16:$src0, i16:$src1))), (inst $src0, $src1) >; -def : Pat< +def : GCNPat< (i64 (zext (op i16:$src0, i16:$src1))), (REG_SEQUENCE VReg_64, (inst $src0, $src1), sub0, @@ -481,18 +497,18 @@ def : Pat< multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst> { -def : Pat< +def : GCNPat< (op i16:$src0, i16:$src1), (inst $src1, $src0) >; -def : Pat< +def : GCNPat< (i32 (zext (op i16:$src0, i16:$src1))), (inst $src1, $src0) >; -def : Pat< +def : GCNPat< (i64 (zext (op i16:$src0, i16:$src1))), (REG_SEQUENCE VReg_64, (inst $src1, $src0), sub0, @@ -500,7 +516,7 @@ def : Pat< >; } -class ZExt_i16_i1_Pat <SDNode ext> : Pat < +class ZExt_i16_i1_Pat <SDNode ext> : GCNPat < (i16 (ext i1:$src)), (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src) >; @@ -515,17 +531,17 @@ defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>; defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>; defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>; -def : Pat < +def : GCNPat < (and i16:$src0, i16:$src1), (V_AND_B32_e64 $src0, $src1) >; -def : Pat < +def : GCNPat < (or i16:$src0, i16:$src1), (V_OR_B32_e64 $src0, $src1) >; -def : Pat < +def : GCNPat < (xor i16:$src0, i16:$src1), (V_XOR_B32_e64 $src0, $src1) >; @@ -537,7 +553,7 @@ defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64>; def : ZExt_i16_i1_Pat<zext>; def : ZExt_i16_i1_Pat<anyext>; -def : Pat < +def : GCNPat < (i16 (sext i1:$src)), (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src) >; @@ -545,7 +561,7 @@ def : Pat < // Undo sub x, c -> add x, -c canonicalization since c is more likely // an inline immediate than -c. // TODO: Also do for 64-bit. -def : Pat< +def : GCNPat< (add i16:$src0, (i16 NegSubInlineConst16:$src1)), (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1) >; @@ -651,14 +667,12 @@ defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>; // VI //===----------------------------------------------------------------------===// -class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, VOPProfile P = ps.Pfl> : - VOP_DPP <ps.OpName, P> { +class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfile P = ps.Pfl> : + VOP_DPP <OpName, P> { let Defs = ps.Defs; let Uses = ps.Uses; let SchedRW = ps.SchedRW; let hasSideEffects = ps.hasSideEffects; - let Constraints = ps.Constraints; - let DisableEncoding = ps.DisableEncoding; bits<8> vdst; bits<8> src1; @@ -705,12 +719,6 @@ multiclass VOP2_Real_e64only_vi <bits<10> op> { } } -multiclass Base_VOP2be_Real_e32e64_vi <bits<6> op> : VOP2_Real_e32_vi<op> { - def _e64_vi : - VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>, - VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>; -} - multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> : VOP2_Real_e32_vi<op>, VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>; @@ -729,13 +737,86 @@ multiclass VOP2_SDWA9_Real <bits<6> op> { VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>; } -multiclass VOP2be_Real_e32e64_vi <bits<6> op> : - Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> { - // For now left dpp only for asm/dasm - // TODO: add corresponding pseudo - def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>; +let AssemblerPredicates = [isVIOnly] in { + +multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName> { + def _e32_vi : + VOP2_Real<!cast<VOP2_Pseudo>(OpName#"_e32"), SIEncodingFamily.VI>, + VOP2e<op{5-0}, 
+      VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32");
+      let AsmString = AsmName # ps.AsmOperands;
+      let DecoderNamespace = "VI";
+    }
+  def _e64_vi :
+    VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.VI>,
+    VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
+      VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
+      let AsmString = AsmName # ps.AsmOperands;
+      let DecoderNamespace = "VI";
+    }
+  def _sdwa_vi :
+    VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
+    VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
+      VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
+      let AsmString = AsmName # ps.AsmOperands;
+    }
+  def _dpp :
+    VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName>;
+}
+}
+
+let AssemblerPredicates = [isGFX9] in {
+
+multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
+  def _e32_gfx9 :
+    VOP2_Real<!cast<VOP2_Pseudo>(OpName#"_e32"), SIEncodingFamily.GFX9>,
+    VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> {
+      VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32");
+      let AsmString = AsmName # ps.AsmOperands;
+      let DecoderNamespace = "GFX9";
+    }
+  def _e64_gfx9 :
+    VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.GFX9>,
+    VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
+      VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
+      let AsmString = AsmName # ps.AsmOperands;
+      let DecoderNamespace = "GFX9";
+    }
+  def _sdwa_gfx9 :
+    VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
+    VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
+      VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
+      let AsmString = AsmName # ps.AsmOperands;
+    }
+  def _dpp_gfx9 :
+    VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName> {
+      let DecoderNamespace = "SDWA9";
+    }
+}

+multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
+  def _e32_gfx9 :
+    VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX9>,
+    VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>{
+      let DecoderNamespace = "GFX9";
+    }
+  def _e64_gfx9 :
+    VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>,
+    VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+      let DecoderNamespace = "GFX9";
+    }
+  def _sdwa_gfx9 :
+    VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
+    VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
+    }
+  def _dpp_gfx9 :
+    VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
+      let DecoderNamespace = "SDWA9";
+    }
+}
+
+} // AssemblerPredicates = [isGFX9]
+
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
  Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
  // For now left dpp only for asm/dasm
@@ -768,12 +849,24 @@ defm V_XOR_B32 : VOP2_Real_e32e64_vi <0x15>;
defm V_MAC_F32 : VOP2_Real_e32e64_vi <0x16>;
defm V_MADMK_F32 : VOP2_Real_MADK_vi <0x17>;
defm V_MADAK_F32 : VOP2_Real_MADK_vi <0x18>;
-defm V_ADD_I32 : VOP2be_Real_e32e64_vi <0x19>;
-defm V_SUB_I32 : VOP2be_Real_e32e64_vi <0x1a>;
-defm V_SUBREV_I32 : VOP2be_Real_e32e64_vi <0x1b>;
-defm V_ADDC_U32 : VOP2be_Real_e32e64_vi <0x1c>;
-defm V_SUBB_U32 : VOP2be_Real_e32e64_vi <0x1d>;
-defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi <0x1e>;
+
+defm V_ADD_U32 : VOP2be_Real_e32e64_vi_only <0x19, "V_ADD_I32", "v_add_u32">;
+defm V_SUB_U32 : VOP2be_Real_e32e64_vi_only <0x1a, "V_SUB_I32", "v_sub_u32">;
+defm V_SUBREV_U32 : VOP2be_Real_e32e64_vi_only <0x1b, "V_SUBREV_I32", "v_subrev_u32">;
+defm V_ADDC_U32 : VOP2be_Real_e32e64_vi_only <0x1c, "V_ADDC_U32", "v_addc_u32">;
+defm V_SUBB_U32 : VOP2be_Real_e32e64_vi_only <0x1d, "V_SUBB_U32", "v_subb_u32">;
+defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi_only <0x1e, "V_SUBBREV_U32", "v_subbrev_u32">;
+
+defm V_ADD_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x19, "V_ADD_I32", "v_add_co_u32">;
+defm V_SUB_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1a, "V_SUB_I32", "v_sub_co_u32">;
+defm V_SUBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1b, "V_SUBREV_I32", "v_subrev_co_u32">;
+defm V_ADDC_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1c, "V_ADDC_U32", "v_addc_co_u32">;
+defm V_SUBB_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1d, "V_SUBB_U32", "v_subb_co_u32">;
+defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_subbrev_co_u32">;
+
+defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>;
+defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>;
+defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>;

defm V_READLANE_B32 : VOP32_Real_vi <0x289>;
defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 92ed0706dc01..aedbfa015bf6 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -53,6 +53,46 @@ class getVOP3PModPat<VOPProfile P, SDPatternOperator node> {
                  ret1));
}

+class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> {
+  list<dag> ret3 = [(set P.DstVT:$vdst,
+      (node (P.Src0VT !if(P.HasClamp, (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+                                      (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))),
+            (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers)),
+            (P.Src2VT (VOP3OpSel P.Src2VT:$src2, i32:$src2_modifiers))))];
+
+  list<dag> ret2 = [(set P.DstVT:$vdst,
+      (node !if(P.HasClamp, (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
+                            (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))),
+            (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))];
+
+  list<dag> ret1 = [(set P.DstVT:$vdst,
+      (node (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+
+  list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+                  !if(!eq(P.NumSrcArgs, 2), ret2,
+                  ret1));
+}
+
+class getVOP3OpSelModPat<VOPProfile P, SDPatternOperator node> {
+  list<dag> ret3 = [(set P.DstVT:$vdst,
+      (node (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+                                      (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
+            (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers)),
+            (P.Src2VT (VOP3OpSelMods P.Src2VT:$src2, i32:$src2_modifiers))))];
+
+  list<dag> ret2 = [(set P.DstVT:$vdst,
+      (node !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
+                            (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
+            (P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers))))];
+
+  list<dag> ret1 = [(set P.DstVT:$vdst,
+      (node (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+
+  list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+                  !if(!eq(P.NumSrcArgs, 2), ret2,
+                  ret1));
+}
+
class getVOP3Pat<VOPProfile P, SDPatternOperator node> {
  list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))];
  list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))];
@@ -62,10 +102,36 @@ class getVOP3Pat<VOPProfile P, SDPatternOperator node> {
                  ret1));
}

+class getVOP3ClampPat<VOPProfile P, SDPatternOperator node> {
+  list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, i1:$clamp))];
+  list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, i1:$clamp))];
+  list<dag> ret1 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, i1:$clamp))];
+  list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+                  !if(!eq(P.NumSrcArgs, 2), ret2,
+                  ret1));
+}
+
class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag,
               bit VOP3Only = 0> :
  VOP3_Pseudo<OpName, P,
-    !if(P.HasModifiers, getVOP3ModPat<P, node>.ret, getVOP3Pat<P, node>.ret),
-    VOP3Only>;
+    !if(P.HasOpSel,
+        !if(P.HasModifiers,
+            getVOP3OpSelModPat<P, node>.ret,
+            getVOP3OpSelPat<P, node>.ret),
+        !if(P.HasModifiers,
+            getVOP3ModPat<P, node>.ret,
+            !if(P.HasIntClamp,
+                getVOP3ClampPat<P, node>.ret,
+                getVOP3Pat<P, node>.ret))),
+    VOP3Only, 0, P.HasOpSel> {
+
+  let IntClamp = P.HasIntClamp;
+  let AsmMatchConverter =
+    !if(P.HasOpSel,
+        "cvtVOP3OpSel",
+        !if(!or(P.HasModifiers, !or(P.HasOMod, P.HasIntClamp)),
+            "cvtVOP3",
+            ""));
+}

// Special case for v_div_fmas_{f32|f64}, since it seems to be the
// only VOP instruction that implicitly reads VCC.
@@ -87,10 +153,33 @@ class getVOP3VCC<VOPProfile P, SDPatternOperator node> {
       (i1 VCC)))];
}

-class VOP3_Profile<VOPProfile P> : VOPProfile<P.ArgVT> {
+class VOP3Features<bit Clamp, bit OpSel> {
+  bit HasClamp = Clamp;
+  bit HasOpSel = OpSel;
+}
+
+def VOP3_REGULAR : VOP3Features<0, 0>;
+def VOP3_CLAMP   : VOP3Features<1, 0>;
+def VOP3_OPSEL   : VOP3Features<1, 1>;
+
+class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOPProfile<P.ArgVT> {
+
+  let HasClamp = !if(Features.HasClamp, 1, P.HasClamp);
+  let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel);
+
  // FIXME: Hack to stop printing _e64
  let Outs64 = (outs DstRC.RegClass:$vdst);
-  let Asm64 = " " # P.Asm64;
+  let Asm64 =
+    " " # !if(Features.HasOpSel,
+              getAsmVOP3OpSel<NumSrcArgs,
+                              HasIntClamp,
+                              HasSrc0FloatMods,
+                              HasSrc1FloatMods,
+                              HasSrc2FloatMods>.ret,
+              !if(Features.HasClamp,
+                  getAsm64<HasDst, NumSrcArgs, HasIntClamp,
+                           HasModifiers, HasOMod, DstVT>.ret,
+                  P.Asm64));
}

class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
@@ -112,11 +201,75 @@ def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> {
}

def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
+  let HasClamp = 1;
+
  // FIXME: Hack to stop printing _e64
  let DstRC = RegisterOperand<VReg_64>;
  let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
-  let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
+  let Asm64 = " $vdst, $sdst, $src0, $src1, $src2$clamp";
+}
+
+//===----------------------------------------------------------------------===//
+// VOP3 INTERP
+//===----------------------------------------------------------------------===//
+
+class VOP3Interp<string OpName, VOPProfile P> : VOP3_Pseudo<OpName, P> {
+  let AsmMatchConverter = "cvtVOP3Interp";
+}
+
+def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> {
+  let Ins64 = (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
+                   Attr:$attr, AttrChan:$attrchan,
+                   clampmod:$clamp, omod:$omod);
+
+  let Asm64 = "$vdst, $src0_modifiers, $attr$attrchan$clamp$omod";
+}
+
+def VOP3_INTERP_MOV : VOPProfile<[f32, i32, i32, untyped]> {
+  let Ins64 = (ins InterpSlot:$src0,
+                   Attr:$attr, AttrChan:$attrchan,
+                   clampmod:$clamp, omod:$omod);
+
+  let Asm64 = "$vdst, $src0, $attr$attrchan$clamp$omod";
+
+  let HasClamp = 1;
+}
+
+class getInterp16Asm <bit HasSrc2, bit HasOMod> {
+  string src2 = !if(HasSrc2, ", $src2_modifiers", "");
+  string omod = !if(HasOMod, "$omod", "");
+  string ret =
$attr$attrchan"#src2#"$high$clamp"#omod; +} + +class getInterp16Ins <bit HasSrc2, bit HasOMod, + Operand Src0Mod, Operand Src2Mod> { + dag ret = !if(HasSrc2, + !if(HasOMod, + (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, + Attr:$attr, AttrChan:$attrchan, + Src2Mod:$src2_modifiers, VRegSrc_32:$src2, + highmod:$high, clampmod:$clamp, omod:$omod), + (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, + Attr:$attr, AttrChan:$attrchan, + Src2Mod:$src2_modifiers, VRegSrc_32:$src2, + highmod:$high, clampmod:$clamp) + ), + (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0, + Attr:$attr, AttrChan:$attrchan, + highmod:$high, clampmod:$clamp, omod:$omod) + ); +} + +class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> { + + let HasOMod = !if(!eq(DstVT.Value, f16.Value), 0, 1); + let HasHigh = 1; + + let Outs64 = (outs VGPR_32:$vdst); + let Ins64 = getInterp16Ins<HasSrc2, HasOMod, Src0Mod, Src2Mod>.ret; + let Asm64 = getInterp16Asm<HasSrc2, HasOMod>.ret; } //===----------------------------------------------------------------------===// @@ -127,8 +280,8 @@ let isCommutable = 1 in { def V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>; -def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUmad_i24>; -def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUmad_u24>; +def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; +def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>; def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>; def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>; @@ -188,10 +341,10 @@ def V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDG def V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>; def V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>; def V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>; -def V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_u8>; -def V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_hi_u8>; -def V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_sad_u16>; -def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>; +def V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; +def V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; +def V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; +def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>; def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>; @@ -213,10 +366,10 @@ def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, let AsmMatchConverter = ""; } -def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_msad_u8>; +def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>; let Constraints = 
"@earlyclobber $vdst" in { -def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_mqsad_pk_u16_u8>; +def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>; } // End Constraints = "@earlyclobber $vdst" def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> { @@ -241,13 +394,15 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>; let SubtargetPredicate = isCIVI in { let Constraints = "@earlyclobber $vdst" in { -def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>; -def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>; +def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>; +def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32, VOP3_CLAMP>>; } // End Constraints = "@earlyclobber $vdst" let isCommutable = 1 in { +let SchedRW = [WriteDouble, WriteSALU] in { def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; +} // End SchedRW = [WriteDouble, WriteSALU] } // End isCommutable = 1 } // End SubtargetPredicate = isCIVI @@ -255,23 +410,42 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; let SubtargetPredicate = Has16BitInsts in { -def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>; +let renamedInGFX9 = 1 in { +def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>; +} +let SubtargetPredicate = isGFX9 in { +def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>; +} let isCommutable = 1 in { -def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>; -def V_INTERP_P1LL_F16 : VOP3Inst <"v_interp_p1ll_f16", VOP3_Profile<VOP_F32_F32_F16>>; -def V_INTERP_P1LV_F16 : VOP3Inst <"v_interp_p1lv_f16", VOP3_Profile<VOP_F32_F32_F16_F16>>; -def V_INTERP_P2_F16 : VOP3Inst <"v_interp_p2_f16", VOP3_Profile<VOP_F16_F32_F16_F32>>; -def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>; +let renamedInGFX9 = 1 in { +def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>; +def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>; +def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>; +def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>; +def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>; +} + +let SubtargetPredicate = isGFX9 in { +def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>; +def V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>; +def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>; +def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>; +def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>; +} // End SubtargetPredicate = isGFX9 -def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>; -def V_MAD_I16 : VOP3Inst <"v_mad_i16", 
-def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
+def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
+def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;

} // End isCommutable = 1

} // End SubtargetPredicate = Has16BitInsts

let SubtargetPredicate = isVI in {
+def V_INTERP_P1_F32_e64  : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
+def V_INTERP_P2_F32_e64  : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
+def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
+
def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
} // End SubtargetPredicate = isVI
@@ -279,20 +453,20 @@ let Predicates = [Has16BitInsts] in {

multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
                             Instruction inst, SDPatternOperator op3> {
-def : Pat<
+def : GCNPat <
  (op2 (op1 i16:$src0, i16:$src1), i16:$src2),
-  (inst i16:$src0, i16:$src1, i16:$src2)
+  (inst i16:$src0, i16:$src1, i16:$src2, (i1 0))
>;

-def : Pat<
+def : GCNPat<
  (i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
-  (inst i16:$src0, i16:$src1, i16:$src2)
+  (inst i16:$src0, i16:$src1, i16:$src2, (i1 0))
>;

-def : Pat<
+def : GCNPat<
  (i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
   (REG_SEQUENCE VReg_64,
-     (inst i16:$src0, i16:$src1, i16:$src2), sub0,
+     (inst i16:$src0, i16:$src1, i16:$src2, (i1 0)), sub0,
     (V_MOV_B32_e32 (i32 0)), sub1)
>;
}
@@ -303,7 +477,7 @@ defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
} // End Predicates = [Has16BitInsts]

let SubtargetPredicate = isGFX9 in {
-def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16>>;
+def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -313,19 +487,70 @@ def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;

def V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;

-def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmed3>;
-def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmed3>;
-def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumed3>;
+def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmed3>;
+def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmed3>;
+def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumed3>;
+
+def V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmin3>;
+def V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmin3>;
+def V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumin3>;
+
+def V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmax3>;
+def V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmax3>;
+def V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumax3>;
+
+def V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
+def V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;

-def V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmin3>;
-def V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmin3>;
-def V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumin3>;
+def V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;
+def V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32, VOP3_OPSEL>>;

-def V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmax3>;
-def V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmax3>;
-def V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumax3>;
+def V_CVT_PKNORM_I16_F16 : VOP3Inst <"v_cvt_pknorm_i16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
+def V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
+
+def V_ADD_I32_gfx9 : VOP3Inst <"v_add_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_SUB_I32_gfx9 : VOP3Inst <"v_sub_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>;
} // End SubtargetPredicate = isGFX9

+//===----------------------------------------------------------------------===//
+// Integer Clamp Patterns
+//===----------------------------------------------------------------------===//
+
+class getClampPat<VOPProfile P, SDPatternOperator node> {
+  dag ret3 = (P.DstVT (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2));
+  dag ret2 = (P.DstVT (node P.Src0VT:$src0, P.Src1VT:$src1));
+  dag ret1 = (P.DstVT (node P.Src0VT:$src0));
+  dag ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+            !if(!eq(P.NumSrcArgs, 2), ret2,
+            ret1));
+}
+
+class getClampRes<VOPProfile P, Instruction inst> {
+  dag ret3 = (inst P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2, (i1 0));
+  dag ret2 = (inst P.Src0VT:$src0, P.Src1VT:$src1, (i1 0));
+  dag ret1 = (inst P.Src0VT:$src0, (i1 0));
+  dag ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+            !if(!eq(P.NumSrcArgs, 2), ret2,
+            ret1));
+}
+
+class IntClampPat<VOP3Inst inst, SDPatternOperator node> : GCNPat<
+  getClampPat<inst.Pfl, node>.ret,
+  getClampRes<inst.Pfl, inst>.ret
+>;
+
+def : IntClampPat<V_MAD_I32_I24, AMDGPUmad_i24>;
+def : IntClampPat<V_MAD_U32_U24, AMDGPUmad_u24>;
+
+def : IntClampPat<V_SAD_U8, int_amdgcn_sad_u8>;
+def : IntClampPat<V_SAD_HI_U8, int_amdgcn_sad_hi_u8>;
+def : IntClampPat<V_SAD_U16, int_amdgcn_sad_u16>;
+
+def : IntClampPat<V_MSAD_U8, int_amdgcn_msad_u8>;
+def : IntClampPat<V_MQSAD_PK_U16_U8, int_amdgcn_mqsad_pk_u16_u8>;
+
+def : IntClampPat<V_QSAD_PK_U16_U8, int_amdgcn_qsad_pk_u16_u8>;
+def : IntClampPat<V_MQSAD_U32_U8, int_amdgcn_mqsad_u32_u8>;

//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
@@ -443,8 +668,68 @@ multiclass VOP3be_Real_vi<bits<10> op> {
             VOP3be_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
}

+multiclass VOP3OpSel_Real_gfx9<bits<10> op> {
+  def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+            VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP3Interp_Real_vi<bits<10> op> {
+  def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+            VOP3Interp_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"

+let AssemblerPredicates = [isVIOnly], DecoderNamespace = "VI" in {
+
+multiclass VOP3_F16_Real_vi<bits<10> op> {
+  def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+            VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP3Interp_F16_Real_vi<bits<10> op> {
+  def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
+            VOP3Interp_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+}
+
+} // End AssemblerPredicates = [isVIOnly], DecoderNamespace = "VI"
+
+let AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9" in {
+
+multiclass VOP3_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
+  def _vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
+            VOP3e_vi <op, !cast<VOP3_Pseudo>(OpName).Pfl> {
+              VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName);
+              let AsmString = AsmName # ps.AsmOperands;
+            }
+}
+
+multiclass VOP3OpSel_F16_Real_gfx9<bits<10> op, string AsmName> {
+  def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>,
+            VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
+              VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME);
+              let AsmString = AsmName # ps.AsmOperands;
+            }
+}
+
+multiclass VOP3Interp_F16_Real_gfx9<bits<10> op, string OpName, string AsmName> {
+  def _vi : VOP3_Real<!cast<VOP3_Pseudo>(OpName), SIEncodingFamily.GFX9>,
+            VOP3Interp_vi <op, !cast<VOP3_Pseudo>(OpName).Pfl> {
+              VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName);
+              let AsmString = AsmName # ps.AsmOperands;
+            }
+}
+
+multiclass VOP3_Real_gfx9<bits<10> op, string AsmName> {
+  def _vi : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX9>,
+            VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
+              VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME);
+              let AsmString = AsmName # ps.AsmOperands;
+            }
+}
+
+} // End AssemblerPredicates = [isGFX9], DecoderNamespace = "GFX9"
+
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>;
@@ -489,18 +774,38 @@ defm V_QSAD_PK_U16_U8 : VOP3_Real_vi <0x1e5>;
defm V_MQSAD_PK_U16_U8 : VOP3_Real_vi <0x1e6>;
defm V_MQSAD_U32_U8 : VOP3_Real_vi <0x1e7>;

-defm V_MAD_F16 : VOP3_Real_vi <0x1ea>;
-defm V_MAD_U16 : VOP3_Real_vi <0x1eb>;
-defm V_MAD_I16 : VOP3_Real_vi <0x1ec>;
-
defm V_PERM_B32 : VOP3_Real_vi <0x1ed>;

-defm V_FMA_F16 : VOP3_Real_vi <0x1ee>;
-defm V_DIV_FIXUP_F16 : VOP3_Real_vi <0x1ef>;
-
-defm V_INTERP_P1LL_F16 : VOP3_Real_vi <0x274>;
-defm V_INTERP_P1LV_F16 : VOP3_Real_vi <0x275>;
-defm V_INTERP_P2_F16 : VOP3_Real_vi <0x276>;
+defm V_MAD_F16 : VOP3_F16_Real_vi <0x1ea>;
+defm V_MAD_U16 : VOP3_F16_Real_vi <0x1eb>;
+defm V_MAD_I16 : VOP3_F16_Real_vi <0x1ec>;
+defm V_FMA_F16 : VOP3_F16_Real_vi <0x1ee>;
+defm V_DIV_FIXUP_F16 : VOP3_F16_Real_vi <0x1ef>;
+defm V_INTERP_P2_F16 : VOP3Interp_F16_Real_vi <0x276>;
+
+defm V_MAD_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ea, "V_MAD_F16", "v_mad_legacy_f16">;
+defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">;
+defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">;
+defm V_FMA_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ee, "V_FMA_F16", "v_fma_legacy_f16">;
+defm V_DIV_FIXUP_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ef, "V_DIV_FIXUP_F16", "v_div_fixup_legacy_f16">;
+defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16", "v_interp_p2_legacy_f16">;
+
+defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">;
+defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
+defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">;
+defm V_FMA_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x206, "v_fma_f16">;
+defm V_DIV_FIXUP_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x207, "v_div_fixup_f16">;
+defm V_INTERP_P2_F16_gfx9 : VOP3Interp_F16_Real_gfx9 <0x277, "V_INTERP_P2_F16_gfx9", "v_interp_p2_f16">;
+
"v_add_i32">; +defm V_SUB_I32_gfx9 : VOP3_Real_gfx9 <0x29d, "v_sub_i32">; + +defm V_INTERP_P1_F32_e64 : VOP3Interp_Real_vi <0x270>; +defm V_INTERP_P2_F32_e64 : VOP3Interp_Real_vi <0x271>; +defm V_INTERP_MOV_F32_e64 : VOP3Interp_Real_vi <0x272>; + +defm V_INTERP_P1LL_F16 : VOP3Interp_Real_vi <0x274>; +defm V_INTERP_P1LV_F16 : VOP3Interp_Real_vi <0x275>; defm V_ADD_F64 : VOP3_Real_vi <0x280>; defm V_MUL_F64 : VOP3_Real_vi <0x281>; defm V_MIN_F64 : VOP3_Real_vi <0x282>; @@ -527,18 +832,27 @@ defm V_ADD3_U32 : VOP3_Real_vi <0x1ff>; defm V_LSHL_OR_B32 : VOP3_Real_vi <0x200>; defm V_AND_OR_B32 : VOP3_Real_vi <0x201>; defm V_OR3_B32 : VOP3_Real_vi <0x202>; -defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>; +defm V_PACK_B32_F16 : VOP3OpSel_Real_gfx9 <0x2a0>; defm V_XAD_U32 : VOP3_Real_vi <0x1f3>; -defm V_MIN3_F16 : VOP3_Real_vi <0x1f4>; -defm V_MIN3_I16 : VOP3_Real_vi <0x1f5>; -defm V_MIN3_U16 : VOP3_Real_vi <0x1f6>; +defm V_MIN3_F16 : VOP3OpSel_Real_gfx9 <0x1f4>; +defm V_MIN3_I16 : VOP3OpSel_Real_gfx9 <0x1f5>; +defm V_MIN3_U16 : VOP3OpSel_Real_gfx9 <0x1f6>; + +defm V_MAX3_F16 : VOP3OpSel_Real_gfx9 <0x1f7>; +defm V_MAX3_I16 : VOP3OpSel_Real_gfx9 <0x1f8>; +defm V_MAX3_U16 : VOP3OpSel_Real_gfx9 <0x1f9>; + +defm V_MED3_F16 : VOP3OpSel_Real_gfx9 <0x1fa>; +defm V_MED3_I16 : VOP3OpSel_Real_gfx9 <0x1fb>; +defm V_MED3_U16 : VOP3OpSel_Real_gfx9 <0x1fc>; + +defm V_ADD_I16 : VOP3OpSel_Real_gfx9 <0x29e>; +defm V_SUB_I16 : VOP3OpSel_Real_gfx9 <0x29f>; -defm V_MAX3_F16 : VOP3_Real_vi <0x1f7>; -defm V_MAX3_I16 : VOP3_Real_vi <0x1f8>; -defm V_MAX3_U16 : VOP3_Real_vi <0x1f9>; +defm V_MAD_U32_U16 : VOP3OpSel_Real_gfx9 <0x1f1>; +defm V_MAD_I32_I16 : VOP3OpSel_Real_gfx9 <0x1f2>; -defm V_MED3_F16 : VOP3_Real_vi <0x1fa>; -defm V_MED3_I16 : VOP3_Real_vi <0x1fb>; -defm V_MED3_U16 : VOP3_Real_vi <0x1fc>; +defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx9 <0x299>; +defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>; diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td index 3becf758aaa3..eeee8b36c175 100644 --- a/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/lib/Target/AMDGPU/VOP3PInstructions.td @@ -18,16 +18,25 @@ class VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> // Non-packed instructions that use the VOP3P encoding. // VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed. -class VOP3_VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> : +class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0, + SDPatternOperator node = null_frag> : VOP3P_Pseudo<OpName, P> { + // These operands are only sort of f16 operands. Depending on + // op_sel_hi, these may be interpreted as f32. The inline immediate + // values are really f16 converted to f32, so we treat these as f16 + // operands. 
  let InOperandList =
-    (ins
-      FP32InputMods:$src0_modifiers, VCSrc_f32:$src0,
-      FP32InputMods:$src1_modifiers, VCSrc_f32:$src1,
-      FP32InputMods:$src2_modifiers, VCSrc_f32:$src2,
-      clampmod:$clamp,
-      op_sel:$op_sel,
-      op_sel_hi:$op_sel_hi);
+    !con(
+      !con(
+        (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
+             FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
+             FP16InputMods:$src2_modifiers, VCSrc_f16:$src2,
+             clampmod:$clamp),
+        !if(UseTiedOutput, (ins VGPR_32:$vdst_in), (ins))),
+      (ins op_sel:$op_sel, op_sel_hi:$op_sel_hi));
+
+  let Constraints = !if(UseTiedOutput, "$vdst = $vdst_in", "");
+  let DisableEncoding = !if(UseTiedOutput, "$vdst_in", "");

  let AsmOperands = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
}
@@ -59,14 +68,80 @@ def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I1
def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;

-// XXX - Commutable?
+
+let SubtargetPredicate = HasMadMixInsts in {
// These are VOP3a-like opcodes which accept no omod.
// Size of src arguments (16/32) is controlled by op_sel.
// For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi.
-def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_V2F16_V2F16_V2F16>>;
-def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
-def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
+let isCommutable = 1 in {
+def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
+
+// Clamp modifier is applied after conversion to f16.
+def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+
+let ClampLo = 0, ClampHi = 1 in {
+def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
+}
+}
+
+def : GCNPat <
+  (f16 (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+                      (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+                      (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+  (V_MAD_MIXLO_F16 $src0_modifiers, $src0,
+                   $src1_modifiers, $src1,
+                   $src2_modifiers, $src2,
+                   DSTCLAMP.NONE,
+                   (i32 (IMPLICIT_DEF)))
+>;
+
+// FIXME: Special case handling for maxhi (especially for clamp)
+// because dealing with the write to high half of the register is
+// difficult.
+def : GCNPat <
+  (build_vector f16:$elt0, (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+                                          (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+                                          (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
+  (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
+                          $src1_modifiers, $src1,
+                          $src2_modifiers, $src2,
+                          DSTCLAMP.NONE,
+                          $elt0))
+>;
+
+def : GCNPat <
+  (build_vector
+    f16:$elt0,
+    (AMDGPUclamp (fpround (fmad (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
+                                (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
+                                (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
+  (v2f16 (V_MAD_MIXHI_F16 $src0_modifiers, $src0,
+                          $src1_modifiers, $src1,
+                          $src2_modifiers, $src2,
+                          DSTCLAMP.ENABLE,
+                          $elt0))
+>;
+
+def : GCNPat <
+  (AMDGPUclamp (build_vector
+    (fpround (fmad (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
+                   (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
+                   (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
+    (fpround (fmad (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
                   (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
                   (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
+  (v2f16 (V_MAD_MIXHI_F16 $hi_src0_modifiers, $hi_src0,
+                          $hi_src1_modifiers, $hi_src1,
+                          $hi_src2_modifiers, $hi_src2,
+                          DSTCLAMP.ENABLE,
+                          (V_MAD_MIXLO_F16 $lo_src0_modifiers, $lo_src0,
+                                           $lo_src1_modifiers, $lo_src1,
+                                           $lo_src2_modifiers, $lo_src2,
+                                           DSTCLAMP.ENABLE,
+                                           (i32 (IMPLICIT_DEF)))))
+>;
+} // End SubtargetPredicate = [HasMadMixInsts]

multiclass VOP3P_Real_vi<bits<10> op> {
  def _vi : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.VI>,
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index b636fc9be431..146870e21531 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -607,9 +607,7 @@ defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
// V_ICMPIntrinsic Pattern.
//===----------------------------------------------------------------------===//

-let Predicates = [isGCN] in {
-
-class ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat <
+class ICMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : GCNPat <
  (AMDGPUsetcc vt:$src0, vt:$src1, cond),
  (inst $src0, $src1)
>;
@@ -636,7 +634,7 @@ def : ICMP_Pattern <COND_SGE, V_CMP_GE_I64_e64, i64>;
def : ICMP_Pattern <COND_SLT, V_CMP_LT_I64_e64, i64>;
def : ICMP_Pattern <COND_SLE, V_CMP_LE_I64_e64, i64>;

-class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat <
+class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : GCNPat <
  (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
                    (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
  (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
@@ -671,8 +669,6 @@ def : FCMP_Pattern <COND_UGE, V_CMP_NLT_F64_e64, f64>;
def : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>;
def : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>;

-} // End Predicates = [isGCN]
-
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index b47538ba0349..f24ff5ce8dea 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -65,8 +65,13 @@ class VOP3Common <dag outs, dag ins, string asm = "",
}

class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
-                   bit VOP3Only = 0, bit isVOP3P = 0> :
-  InstSI <P.Outs64, !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64), "", pattern>,
+                   bit VOP3Only = 0, bit isVOP3P = 0, bit isVop3OpSel = 0> :
+  InstSI <P.Outs64,
+          !if(isVop3OpSel,
+              P.InsVOP3OpSel,
+              !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64)),
+          "",
+          pattern>,
  VOP <opName>,
  SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
  MnemonicAlias<opName#"_e64", opName> {
@@ -74,9 +79,13 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
  let isPseudo = 1;
  let isCodeGenOnly = 1;
  let UseNamedOperandTable = 1;
+  let VOP3_OPSEL = isVop3OpSel;
+  let IsPacked = P.IsPacked;

  string Mnemonic = opName;
-  string AsmOperands = !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64);
+  string AsmOperands = !if(isVop3OpSel,
+                           P.AsmVOP3OpSel,
+                           !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));

  let Size = 8;
  let mayLoad = 0;
@@ -98,13 +107,17 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
  let VOP3 = 1;
  let VALU = 1;
  let FPClamp = P.HasFPClamp;
+  let IntClamp = P.HasIntClamp;
+  let ClampLo = P.HasClampLo;
+  let ClampHi = P.HasClampHi;
+
  let Uses = [EXEC];

  let AsmVariantName = AMDGPUAsmVariants.VOP3;
  let AsmMatchConverter =
-    !if(!and(P.IsPacked, isVOP3P),
+    !if(isVOP3P,
        "cvtVOP3P",
-        !if(!or(P.HasModifiers, P.HasOMod),
+        !if(!or(P.HasModifiers, !or(P.HasOMod, P.HasIntClamp)),
            "cvtVOP3",
            ""));
@@ -146,11 +159,11 @@ class VOP3P_Real<VOP3P_Pseudo ps, int EncodingFamily> :
  VOP3_Real<ps, EncodingFamily>;

class VOP3a<VOPProfile P> : Enc64 {
-  bits<2> src0_modifiers;
+  bits<4> src0_modifiers;
  bits<9> src0;
-  bits<2> src1_modifiers;
+  bits<3> src1_modifiers;
  bits<9> src1;
-  bits<2> src2_modifiers;
+  bits<3> src2_modifiers;
  bits<9> src2;
  bits<1> clamp;
  bits<2> omod;
@@ -189,6 +202,32 @@ class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> {
  let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0);
}

+class VOP3OpSel_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
+  let Inst{11} = !if(P.HasSrc0, src0_modifiers{2}, 0);
+  let Inst{12} = !if(P.HasSrc1, src1_modifiers{2}, 0);
+  let Inst{13} = !if(P.HasSrc2, src2_modifiers{2}, 0);
+  let Inst{14} = !if(P.HasDst,  src0_modifiers{3}, 0);
+}
+
+// NB: For V_INTERP* opcodes, src0 is encoded as src1 and vice versa
+class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
+  bits<2> attrchan;
+  bits<6> attr;
+  bits<1> high;
+
+  let Inst{8}  = 0; // No modifiers for src0
+  let Inst{61} = 0;
+
+  let Inst{9}  = !if(P.HasSrc0Mods, src0_modifiers{1}, 0);
+  let Inst{62} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0);
+
+  let Inst{37-32} = attr;
+  let Inst{39-38} = attrchan;
+  let Inst{40}    = !if(P.HasHigh, high, 0);
+
+  let Inst{49-41} = src0;
+}
+
class VOP3be <VOPProfile P> : Enc64 {
  bits<8> vdst;
  bits<2> src0_modifiers;
@@ -476,6 +515,8 @@ class VOP_DPP <string OpName, VOPProfile P> :
  let AssemblerPredicate = !if(P.HasExt, HasDPP, DisableInst);
  let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
                                     AMDGPUAsmVariants.Disable);
+  let Constraints = !if(P.NumSrcArgs, "$old = $vdst", "");
+  let DisableEncoding = !if(P.NumSrcArgs, "$old", "");
  let DecoderNamespace = "DPP";
}
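For reference, the IntClampPat helper introduced in VOP3Instructions.td ties an intrinsic back to its clamp-capable pseudo with the clamp operand forced to 0. A minimal sketch of the expansion, assuming the three-source VOP_I32_I32_I32_I32 profile of V_MAD_U32_U24 (i32 result and three i32 sources); this is illustrative only and not part of the patch:

// Hypothetical expansion of IntClampPat<V_MAD_U32_U24, AMDGPUmad_u24>,
// derived from the getClampPat/getClampRes definitions above.
def : GCNPat <
  (i32 (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2)),
  (V_MAD_U32_U24 i32:$src0, i32:$src1, i32:$src2, (i1 0))
>;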
