From 044eb2f6afba375a914ac9d8024f8f5142bb912e Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Mon, 18 Dec 2017 20:10:56 +0000 Subject: Vendor import of llvm trunk r321017: https://llvm.org/svn/llvm-project/llvm/trunk@321017 --- lib/Target/AMDGPU/AMDGPU.h | 129 +- lib/Target/AMDGPU/AMDGPU.td | 134 +- lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp | 22 +- lib/Target/AMDGPU/AMDGPUAliasAnalysis.h | 25 +- lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 16 +- lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 29 +- lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp | 131 ++ lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 177 ++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 524 +++++- lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 31 +- lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 40 +- lib/Target/AMDGPU/AMDGPUCallingConv.td | 4 + lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 47 +- lib/Target/AMDGPU/AMDGPUFrameLowering.h | 6 +- lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def | 4 - lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 686 +++++--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 555 ++++-- lib/Target/AMDGPU/AMDGPUISelLowering.h | 61 +- lib/Target/AMDGPU/AMDGPUInline.cpp | 208 +++ lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 33 +- lib/Target/AMDGPU/AMDGPUInstrInfo.h | 7 +- lib/Target/AMDGPU/AMDGPUInstrInfo.td | 24 + lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 3 +- lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 3 +- lib/Target/AMDGPU/AMDGPUInstructions.td | 373 ++--- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 12 +- lib/Target/AMDGPU/AMDGPULibCalls.cpp | 1770 ++++++++++++++++++++ lib/Target/AMDGPU/AMDGPULibFunc.cpp | 1054 ++++++++++++ lib/Target/AMDGPU/AMDGPULibFunc.h | 459 +++++ lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 26 +- lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 354 ++-- lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 2 +- lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp | 29 + lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h | 97 ++ .../AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp | 135 
++ .../AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp | 55 +- lib/Target/AMDGPU/AMDGPUPTNote.h | 8 +- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 11 +- lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 8 +- lib/Target/AMDGPU/AMDGPURegisterInfo.cpp | 9 +- lib/Target/AMDGPU/AMDGPURegisterInfo.td | 2 - lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp | 483 ++++++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 174 +- lib/Target/AMDGPU/AMDGPUSubtarget.h | 127 +- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 203 ++- lib/Target/AMDGPU/AMDGPUTargetMachine.h | 16 +- lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp | 3 +- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 123 +- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 59 +- .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 25 +- lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp | 9 +- lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 33 +- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 391 +++-- lib/Target/AMDGPU/BUFInstructions.td | 375 +++-- lib/Target/AMDGPU/CIInstructions.td | 15 - lib/Target/AMDGPU/CMakeLists.txt | 66 +- lib/Target/AMDGPU/CaymanInstructions.td | 48 +- lib/Target/AMDGPU/DSInstructions.td | 640 ++++--- .../AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 152 +- .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 28 +- lib/Target/AMDGPU/EvergreenInstructions.td | 89 +- lib/Target/AMDGPU/FLATInstructions.td | 774 +++++++-- lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 248 ++- lib/Target/AMDGPU/GCNHazardRecognizer.h | 22 +- lib/Target/AMDGPU/GCNILPSched.cpp | 364 ++++ lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 100 +- lib/Target/AMDGPU/GCNIterativeScheduler.h | 31 +- lib/Target/AMDGPU/GCNMinRegStrategy.cpp | 32 +- lib/Target/AMDGPU/GCNProcessors.td | 154 ++ lib/Target/AMDGPU/GCNRegPressure.cpp | 39 +- lib/Target/AMDGPU/GCNRegPressure.h | 29 +- lib/Target/AMDGPU/GCNSchedStrategy.cpp | 34 +- .../AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp | 401 +++-- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h | 39 +- 
.../AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 31 +- .../AMDGPUCodeObjectMetadataStreamer.cpp | 432 ----- .../AMDGPUCodeObjectMetadataStreamer.h | 99 -- .../AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 21 +- .../AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp | 39 +- lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h | 15 +- .../MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp | 407 +++++ .../MCTargetDesc/AMDGPUHSAMetadataStreamer.h | 96 ++ .../AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 17 +- .../AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 10 +- .../AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 95 +- .../AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 38 +- lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt | 2 +- lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 2 +- lib/Target/AMDGPU/MIMGInstructions.td | 43 +- lib/Target/AMDGPU/Processors.td | 219 +-- lib/Target/AMDGPU/R600ClauseMergePass.cpp | 17 +- lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 28 +- lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 32 +- lib/Target/AMDGPU/R600FrameLowering.h | 4 + lib/Target/AMDGPU/R600ISelLowering.cpp | 52 +- lib/Target/AMDGPU/R600InstrFormats.td | 14 +- lib/Target/AMDGPU/R600InstrInfo.cpp | 32 +- lib/Target/AMDGPU/R600InstrInfo.h | 3 + lib/Target/AMDGPU/R600Instructions.td | 138 +- lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 42 +- lib/Target/AMDGPU/R600Packetizer.cpp | 10 +- lib/Target/AMDGPU/R600Processors.td | 90 + lib/Target/AMDGPU/R600RegisterInfo.td | 1 + lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 36 +- lib/Target/AMDGPU/SIDefines.h | 33 +- .../AMDGPU/SIFixControlFlowLiveIntervals.cpp | 88 - lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 192 ++- lib/Target/AMDGPU/SIFixWWMLiveness.cpp | 202 +++ lib/Target/AMDGPU/SIFoldOperands.cpp | 43 +- lib/Target/AMDGPU/SIFrameLowering.cpp | 147 +- lib/Target/AMDGPU/SIFrameLowering.h | 14 + lib/Target/AMDGPU/SIISelLowering.cpp | 1751 ++++++++++++++++--- lib/Target/AMDGPU/SIISelLowering.h | 58 +- lib/Target/AMDGPU/SIInsertSkips.cpp | 125 +- 
lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 249 ++- lib/Target/AMDGPU/SIInsertWaits.cpp | 21 +- lib/Target/AMDGPU/SIInstrFormats.td | 49 +- lib/Target/AMDGPU/SIInstrInfo.cpp | 636 ++++++- lib/Target/AMDGPU/SIInstrInfo.h | 116 +- lib/Target/AMDGPU/SIInstrInfo.td | 397 ++++- lib/Target/AMDGPU/SIInstructions.td | 547 ++++-- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 532 +++++- lib/Target/AMDGPU/SILowerControlFlow.cpp | 90 +- lib/Target/AMDGPU/SILowerI1Copies.cpp | 7 +- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 189 ++- lib/Target/AMDGPU/SIMachineFunctionInfo.h | 231 ++- lib/Target/AMDGPU/SIMachineScheduler.cpp | 83 +- lib/Target/AMDGPU/SIMachineScheduler.h | 3 + lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 627 +++++++ lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 65 +- lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 252 +++ lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 846 ++++++---- lib/Target/AMDGPU/SIRegisterInfo.cpp | 139 +- lib/Target/AMDGPU/SIRegisterInfo.h | 30 +- lib/Target/AMDGPU/SIRegisterInfo.td | 70 +- lib/Target/AMDGPU/SIShrinkInstructions.cpp | 2 +- lib/Target/AMDGPU/SIWholeQuadMode.cpp | 309 +++- lib/Target/AMDGPU/SMInstructions.td | 40 +- lib/Target/AMDGPU/SOPInstructions.td | 46 +- lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp | 4 +- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 268 ++- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 22 +- lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp | 27 +- lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h | 29 +- lib/Target/AMDGPU/VOP1Instructions.td | 39 +- lib/Target/AMDGPU/VOP2Instructions.td | 227 ++- lib/Target/AMDGPU/VOP3Instructions.td | 432 ++++- lib/Target/AMDGPU/VOP3PInstructions.td | 99 +- lib/Target/AMDGPU/VOPCInstructions.td | 8 +- lib/Target/AMDGPU/VOPInstructions.td | 57 +- 150 files changed, 18883 insertions(+), 5252 deletions(-) create mode 100644 lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp create mode 100644 lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h create mode 100644 lib/Target/AMDGPU/AMDGPUInline.cpp 
create mode 100644 lib/Target/AMDGPU/AMDGPULibCalls.cpp create mode 100644 lib/Target/AMDGPU/AMDGPULibFunc.cpp create mode 100644 lib/Target/AMDGPU/AMDGPULibFunc.h create mode 100644 lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp create mode 100644 lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h create mode 100644 lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp create mode 100644 lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp delete mode 100644 lib/Target/AMDGPU/CIInstructions.td create mode 100644 lib/Target/AMDGPU/GCNILPSched.cpp create mode 100644 lib/Target/AMDGPU/GCNProcessors.td delete mode 100644 lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp delete mode 100644 lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h create mode 100644 lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.cpp create mode 100644 lib/Target/AMDGPU/MCTargetDesc/AMDGPUHSAMetadataStreamer.h create mode 100644 lib/Target/AMDGPU/R600Processors.td delete mode 100644 lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp create mode 100644 lib/Target/AMDGPU/SIFixWWMLiveness.cpp create mode 100644 lib/Target/AMDGPU/SIMemoryLegalizer.cpp create mode 100644 lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp (limited to 'lib/Target/AMDGPU') diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 568682899be5..0ddc43ad5033 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -23,6 +23,7 @@ class ModulePass; class Pass; class Target; class TargetMachine; +class TargetOptions; class PassRegistry; class Module; @@ -34,6 +35,7 @@ FunctionPass *createR600ClauseMergePass(); FunctionPass *createR600Packetizer(); FunctionPass *createR600ControlFlowFinalizer(); FunctionPass *createAMDGPUCFGStructurizerPass(); +FunctionPass *createR600ISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel); // SI Passes FunctionPass *createSIAnnotateControlFlowPass(); @@ -44,12 +46,20 @@ FunctionPass *createSIShrinkInstructionsPass(); 
FunctionPass *createSILoadStoreOptimizerPass(); FunctionPass *createSIWholeQuadModePass(); FunctionPass *createSIFixControlFlowLiveIntervalsPass(); +FunctionPass *createSIOptimizeExecMaskingPreRAPass(); FunctionPass *createSIFixSGPRCopiesPass(); +FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitsPass(); FunctionPass *createSIInsertWaitcntsPass(); +FunctionPass *createSIFixWWMLivenessPass(); +FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &); +FunctionPass *createAMDGPUUseNativeCallsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(); FunctionPass *createAMDGPUMachineCFGStructurizerPass(); +FunctionPass *createAMDGPURewriteOutArgumentsPass(); + +void initializeAMDGPUDAGToDAGISelPass(PassRegistry&); void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&); extern char &AMDGPUMachineCFGStructurizerID; @@ -64,6 +74,24 @@ ModulePass *createAMDGPULowerIntrinsicsPass(); void initializeAMDGPULowerIntrinsicsPass(PassRegistry &); extern char &AMDGPULowerIntrinsicsID; +void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &); +extern char &AMDGPURewriteOutArgumentsID; + +void initializeR600ClauseMergePassPass(PassRegistry &); +extern char &R600ClauseMergePassID; + +void initializeR600ControlFlowFinalizerPass(PassRegistry &); +extern char &R600ControlFlowFinalizerID; + +void initializeR600ExpandSpecialInstrsPassPass(PassRegistry &); +extern char &R600ExpandSpecialInstrsPassID; + +void initializeR600VectorRegMergerPass(PassRegistry &); +extern char &R600VectorRegMergerID; + +void initializeR600PacketizerPass(PassRegistry &); +extern char &R600PacketizerID; + void initializeSIFoldOperandsPass(PassRegistry &); extern char &SIFoldOperandsID; @@ -97,14 +125,24 @@ extern char &SIInsertSkipsPassID; void initializeSIOptimizeExecMaskingPass(PassRegistry &); extern char &SIOptimizeExecMaskingID; +void initializeSIFixWWMLivenessPass(PassRegistry &); +extern char 
&SIFixWWMLivenessID; + +void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &); +extern char &AMDGPUSimplifyLibCallsID; + +void initializeAMDGPUUseNativeCallsPass(PassRegistry &); +extern char &AMDGPUUseNativeCallsID; + // Passes common to R600 and SI FunctionPass *createAMDGPUPromoteAlloca(); void initializeAMDGPUPromoteAllocaPass(PassRegistry&); extern char &AMDGPUPromoteAllocaID; Pass *createAMDGPUStructurizeCFGPass(); -FunctionPass *createAMDGPUISelDag(TargetMachine &TM, - CodeGenOpt::Level OptLevel); +FunctionPass *createAMDGPUISelDag( + TargetMachine *TM = nullptr, + CodeGenOpt::Level OptLevel = CodeGenOpt::Default); ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true); ModulePass *createAMDGPUOpenCLImageTypeLoweringPass(); FunctionPass *createAMDGPUAnnotateUniformValues(); @@ -113,8 +151,8 @@ ModulePass* createAMDGPUUnifyMetadataPass(); void initializeAMDGPUUnifyMetadataPass(PassRegistry&); extern char &AMDGPUUnifyMetadataID; -void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); -extern char &SIFixControlFlowLiveIntervalsID; +void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry&); +extern char &SIOptimizeExecMaskingPreRAID; void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&); extern char &AMDGPUAnnotateUniformValuesPassID; @@ -125,6 +163,9 @@ extern char &AMDGPUCodeGenPrepareID; void initializeSIAnnotateControlFlowPass(PassRegistry&); extern char &SIAnnotateControlFlowPassID; +void initializeSIMemoryLegalizerPass(PassRegistry&); +extern char &SIMemoryLegalizerID; + void initializeSIDebuggerInsertNopsPass(PassRegistry&); extern char &SIDebuggerInsertNopsID; @@ -140,6 +181,15 @@ extern char &AMDGPUUnifyDivergentExitNodesID; ImmutablePass *createAMDGPUAAWrapperPass(); void initializeAMDGPUAAWrapperPassPass(PassRegistry&); +void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &); + +Pass *createAMDGPUFunctionInliningPass(); +void initializeAMDGPUInlinerPass(PassRegistry&); + +ModulePass 
*createAMDGPUOpenCLEnqueuedBlockLoweringPass(); +void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &); +extern char &AMDGPUOpenCLEnqueuedBlockLoweringID; + Target &getTheAMDGPUTarget(); Target &getTheGCNTarget(); @@ -167,39 +217,44 @@ struct AMDGPUAS { unsigned FLAT_ADDRESS; ///< Address space for flat memory. unsigned REGION_ADDRESS; ///< Address space for region memory. - // The maximum value for flat, generic, local, private, constant and region. - const static unsigned MAX_COMMON_ADDRESS = 5; - - const static unsigned GLOBAL_ADDRESS = 1; ///< Address space for global memory (RAT0, VTX0). - const static unsigned CONSTANT_ADDRESS = 2; ///< Address space for constant memory (VTX2) - const static unsigned LOCAL_ADDRESS = 3; ///< Address space for local memory. - const static unsigned PARAM_D_ADDRESS = 6; ///< Address space for direct addressible parameter memory (CONST0) - const static unsigned PARAM_I_ADDRESS = 7; ///< Address space for indirect addressible parameter memory (VTX1) - - // Do not re-order the CONSTANT_BUFFER_* enums. 
Several places depend on this - // order to be able to dynamically index a constant buffer, for example: - // - // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx - - const static unsigned CONSTANT_BUFFER_0 = 8; - const static unsigned CONSTANT_BUFFER_1 = 9; - const static unsigned CONSTANT_BUFFER_2 = 10; - const static unsigned CONSTANT_BUFFER_3 = 11; - const static unsigned CONSTANT_BUFFER_4 = 12; - const static unsigned CONSTANT_BUFFER_5 = 13; - const static unsigned CONSTANT_BUFFER_6 = 14; - const static unsigned CONSTANT_BUFFER_7 = 15; - const static unsigned CONSTANT_BUFFER_8 = 16; - const static unsigned CONSTANT_BUFFER_9 = 17; - const static unsigned CONSTANT_BUFFER_10 = 18; - const static unsigned CONSTANT_BUFFER_11 = 19; - const static unsigned CONSTANT_BUFFER_12 = 20; - const static unsigned CONSTANT_BUFFER_13 = 21; - const static unsigned CONSTANT_BUFFER_14 = 22; - const static unsigned CONSTANT_BUFFER_15 = 23; - - // Some places use this if the address space can't be determined. - const static unsigned UNKNOWN_ADDRESS_SPACE = ~0u; + enum : unsigned { + // The maximum value for flat, generic, local, private, constant and region. + MAX_COMMON_ADDRESS = 5, + + GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). + CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2) + LOCAL_ADDRESS = 3, ///< Address space for local memory. + /// Address space for direct addressible parameter memory (CONST0) + PARAM_D_ADDRESS = 6, + /// Address space for indirect addressible parameter memory (VTX1) + PARAM_I_ADDRESS = 7, + + // Do not re-order the CONSTANT_BUFFER_* enums. 
Several places depend on + // this order to be able to dynamically index a constant buffer, for + // example: + // + // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx + + CONSTANT_BUFFER_0 = 8, + CONSTANT_BUFFER_1 = 9, + CONSTANT_BUFFER_2 = 10, + CONSTANT_BUFFER_3 = 11, + CONSTANT_BUFFER_4 = 12, + CONSTANT_BUFFER_5 = 13, + CONSTANT_BUFFER_6 = 14, + CONSTANT_BUFFER_7 = 15, + CONSTANT_BUFFER_8 = 16, + CONSTANT_BUFFER_9 = 17, + CONSTANT_BUFFER_10 = 18, + CONSTANT_BUFFER_11 = 19, + CONSTANT_BUFFER_12 = 20, + CONSTANT_BUFFER_13 = 21, + CONSTANT_BUFFER_14 = 22, + CONSTANT_BUFFER_15 = 23, + + // Some places use this if the address space can't be determined. + UNKNOWN_ADDRESS_SPACE = ~0u, + }; }; namespace llvm { diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index f1d899c4d003..c02d0a131041 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -19,6 +19,12 @@ def FeatureFP64 : SubtargetFeature<"fp64", "Enable double precision operations" >; +def FeatureFMA : SubtargetFeature<"fmaf", + "FMA", + "true", + "Enable single precision FMA (not as fast as mul+add, but fused)" +>; + def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", "FastFMAF32", "true", @@ -79,6 +85,12 @@ def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts", "Have scratch_* flat memory instructions" >; +def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", + "AddNoCarryInsts", + "true", + "Have VALU add/sub instructions without carry out" +>; + def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access", "UnalignedBufferAccess", "true", @@ -103,6 +115,12 @@ def FeatureApertureRegs : SubtargetFeature<"aperture-regs", "Has Memory Aperture Base and Size Registers" >; +def FeatureMadMixInsts : SubtargetFeature<"mad-mix-insts", + "HasMadMixInsts", + "true", + "Has v_mad_mix_f32, v_mad_mixlo_f16, v_mad_mixhi_f16 instructions" +>; + // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support // XNACK. 
The current default kernel driver setting is: // - graphics ring: XNACK disabled @@ -119,7 +137,7 @@ def FeatureXNACK : SubtargetFeature<"xnack", def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "SGPRInitBug", "true", - "VI SGPR initilization bug requiring a fixed SGPR allocation size" + "VI SGPR initialization bug requiring a fixed SGPR allocation size" >; class SubtargetFeatureFetchLimit : @@ -166,12 +184,6 @@ def FeatureGCN : SubtargetFeature<"gcn", "GCN or newer GPU" >; -def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding", - "GCN1Encoding", - "true", - "Encoding format for SI and CI" ->; - def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", "GCN3Encoding", "true", @@ -181,13 +193,13 @@ def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", def FeatureCIInsts : SubtargetFeature<"ci-insts", "CIInsts", "true", - "Additional intstructions for CI+" + "Additional instructions for CI+" >; def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts", "GFX9Insts", "true", - "Additional intstructions for GFX9+" + "Additional instructions for GFX9+" >; def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime", @@ -274,6 +286,12 @@ def FeatureDPP : SubtargetFeature<"dpp", "Support DPP (Data Parallel Primitives) extension" >; +def FeatureIntClamp : SubtargetFeature<"int-clamp-insts", + "HasIntClamp", + "true", + "Support clamp for integer destination" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -334,6 +352,13 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>; def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; +def FeatureEnableHugePrivateBuffer : SubtargetFeature< + "huge-private-buffer", + "EnableHugePrivateBuffer", + "true", + "Enable private/scratch buffer sizes greater than 128 GB" +>; + 
def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", "EnableVGPRSpilling", "true", @@ -402,6 +427,13 @@ def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature < "Hardware automatically inserts waitcnt before barrier" >; +def FeatureCodeObjectV3 : SubtargetFeature < + "code-object-v3", + "CodeObjectV3", + "true", + "Generate code object version 3" +>; + // Dummy feature used to disable assembler instructions. def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", @@ -436,14 +468,14 @@ def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", [FeatureFP64, FeatureLocalMemorySize32768, - FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding, + FeatureWavefrontSize64, FeatureGCN, FeatureLDSBankCount32, FeatureMovrel] >; def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", [FeatureFP64, FeatureLocalMemorySize65536, FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, - FeatureGCN1Encoding, FeatureCIInsts, FeatureMovrel] + FeatureCIInsts, FeatureMovrel] >; def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", @@ -452,7 +484,8 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, FeatureScalarStores, FeatureInv2PiInlineImm, - FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP + FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP, + FeatureIntClamp ] >; @@ -462,9 +495,10 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, - FeatureFastFMAF32, FeatureDPP, + FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, 
FeatureSDWASdst, - FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts + FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, + FeatureAddNoCarryInsts ] >; @@ -506,6 +540,10 @@ def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3, [FeatureSeaIslands, FeatureLDSBankCount16]>; +def FeatureISAVersion7_0_4 : SubtargetFeatureISAVersion <7,0,4, + [FeatureSeaIslands, + FeatureLDSBankCount32]>; + def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0, [FeatureVolcanicIslands, FeatureLDSBankCount32, @@ -513,6 +551,8 @@ def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0, def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1, [FeatureVolcanicIslands, + FeatureFastFMAF32, + HalfRate64Ops, FeatureLDSBankCount32, FeatureXNACK]>; @@ -525,10 +565,6 @@ def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3, [FeatureVolcanicIslands, FeatureLDSBankCount32]>; -def FeatureISAVersion8_0_4 : SubtargetFeatureISAVersion <8,0,4, - [FeatureVolcanicIslands, - FeatureLDSBankCount32]>; - def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, [FeatureVolcanicIslands, FeatureLDSBankCount16, @@ -536,21 +572,15 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0, def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0, [FeatureGFX9, - FeatureLDSBankCount32]>; - -def FeatureISAVersion9_0_1 : SubtargetFeatureISAVersion <9,0,1, - [FeatureGFX9, - FeatureLDSBankCount32, - FeatureXNACK]>; + FeatureMadMixInsts, + FeatureLDSBankCount32 + ]>; def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2, [FeatureGFX9, - FeatureLDSBankCount32]>; - -def FeatureISAVersion9_0_3 : SubtargetFeatureISAVersion <9,0,3, - [FeatureGFX9, - FeatureLDSBankCount32, - FeatureXNACK]>; + FeatureMadMixInsts, + FeatureLDSBankCount32 + ]>; //===----------------------------------------------------------------------===// // Debugger related subtarget features. 
@@ -660,7 +690,7 @@ def TruePredicate : Predicate<"true">; def isSICI : Predicate< "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" ->, AssemblerPredicate<"FeatureGCN1Encoding">; +>, AssemblerPredicate<"!FeatureGCN3Encoding">; def isVI : Predicate < "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, @@ -680,6 +710,23 @@ def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">, AssemblerPredicate<"FeatureFlatGlobalInsts">; +def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">, + AssemblerPredicate<"FeatureFlatScratchInsts">; +def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, + AssemblerPredicate<"FeatureGFX9Insts">; + + +def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">; +def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">; + +def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, + AssemblerPredicate<"FeatureGFX9Insts">; + +def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarryInsts()">, + AssemblerPredicate<"FeatureAddNoCarryInsts">; + +def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarryInsts()">, + AssemblerPredicate<"!FeatureAddNoCarryInsts">; def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">, AssemblerPredicate<"Feature16BitInsts">; @@ -695,22 +742,41 @@ def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">, def HasDPP : Predicate<"Subtarget->hasDPP()">, AssemblerPredicate<"FeatureDPP">; +def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">, + AssemblerPredicate<"FeatureIntClamp">; + +def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">, + AssemblerPredicate<"FeatureMadMixInsts">; + +def EnableLateCFGStructurize : Predicate< + "EnableLateStructurizeCFG">; + +// Exists to help track down where SubtargetPredicate isn't set rather +// than letting 
tablegen crash with an unhelpful error. +def InvalidPred : Predicate<"predicate not set on instruction or pattern">; + class PredicateControl { - Predicate SubtargetPredicate; + Predicate SubtargetPredicate = InvalidPred; Predicate SIAssemblerPredicate = isSICI; Predicate VIAssemblerPredicate = isVI; list AssemblerPredicates = []; Predicate AssemblerPredicate = TruePredicate; list OtherPredicates = []; - list Predicates = !listconcat([SubtargetPredicate, AssemblerPredicate], + list Predicates = !listconcat([SubtargetPredicate, + AssemblerPredicate], AssemblerPredicates, OtherPredicates); } +class AMDGPUPat : Pat, + PredicateControl; + + // Include AMDGPU TD files include "R600Schedule.td" +include "R600Processors.td" include "SISchedule.td" -include "Processors.td" +include "GCNProcessors.td" include "AMDGPUInstrInfo.td" include "AMDGPUIntrinsics.td" include "AMDGPURegisterInfo.td" diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp index faa424eb0a64..392b011e387c 100644 --- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp +++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -1,4 +1,4 @@ -//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==// +//===- AMDGPUAliasAnalysis ------------------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -12,13 +12,21 @@ #include "AMDGPUAliasAnalysis.h" #include "AMDGPU.h" +#include "llvm/ADT/Triple.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" #include "llvm/Pass.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Casting.h" +#include 
"llvm/Support/ErrorHandling.h" +#include using namespace llvm; @@ -26,6 +34,7 @@ using namespace llvm; // Register this pass... char AMDGPUAAWrapperPass::ID = 0; + INITIALIZE_PASS(AMDGPUAAWrapperPass, "amdgpu-aa", "AMDGPU Address space based Alias Analysis", false, true) @@ -120,8 +129,11 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, switch (F->getCallingConv()) { default: return AAResultBase::pointsToConstantMemory(Loc, OrLocal); - case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_LS: + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_VS: case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_KERNEL: diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h index 5f8ed9b1f9a3..645a38af753c 100644 --- a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h +++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h @@ -1,4 +1,4 @@ -//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==// +//===- AMDGPUAliasAnalysis --------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -10,17 +10,24 @@ /// This is the AMGPU address space based alias analysis pass. //===----------------------------------------------------------------------===// -#ifndef LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H -#define LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H #include "AMDGPU.h" +#include "llvm/ADT/Triple.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" +#include +#include namespace llvm { +class DataLayout; +class MDNode; +class MemoryLocation; + /// A simple AA result that uses TBAA metadata to answer queries. 
class AMDGPUAAResult : public AAResultBase { friend AAResultBase; @@ -50,7 +57,9 @@ private: class ASAliasRulesTy { public: ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_); + AliasResult getAliasResult(unsigned AS1, unsigned AS2) const; + private: Triple::ArchType Arch; AMDGPUAS AS; @@ -61,10 +70,11 @@ private: /// Analysis pass providing a never-invalidated alias analysis result. class AMDGPUAA : public AnalysisInfoMixin { friend AnalysisInfoMixin; + static char PassID; public: - typedef AMDGPUAAResult Result; + using Result = AMDGPUAAResult; AMDGPUAAResult run(Function &F, AnalysisManager &AM) { return AMDGPUAAResult(F.getParent()->getDataLayout(), @@ -91,12 +101,15 @@ public: Triple(M.getTargetTriple()))); return false; } + bool doFinalization(Module &M) override { Result.reset(); return false; } + void getAnalysisUsage(AnalysisUsage &AU) const override; }; -} -#endif // LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUALIASANALYSIS_H diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index 6f3742ed039b..c27425443abc 100644 --- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -21,6 +21,12 @@ using namespace llvm; namespace { +static cl::opt StressCalls( + "amdgpu-stress-function-calls", + cl::Hidden, + cl::desc("Force all functions to be noinline"), + cl::init(false)); + class AMDGPUAlwaysInline : public ModulePass { bool GlobalOpt; @@ -57,9 +63,13 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) { } } + auto NewAttr = StressCalls ? Attribute::NoInline : Attribute::AlwaysInline; + auto IncompatAttr + = StressCalls ? 
Attribute::AlwaysInline : Attribute::NoInline; + for (Function &F : M) { if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() && - !F.hasFnAttribute(Attribute::NoInline)) + !F.hasFnAttribute(IncompatAttr)) FuncsToClone.push_back(&F); } @@ -71,8 +81,8 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) { } for (Function &F : M) { - if (F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::NoInline)) { - F.addFnAttr(Attribute::AlwaysInline); + if (F.hasLocalLinkage() && !F.hasFnAttribute(IncompatAttr)) { + F.addFnAttr(NewAttr); } } return false; diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index c68e5861ff25..ce17202f3414 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===// +//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===// // // The LLVM Compiler Infrastructure // @@ -14,13 +14,28 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "amdgpu-annotate-kernel-features" @@ -42,6 +57,7 @@ public: bool 
doInitialization(CallGraph &CG) override; bool runOnSCC(CallGraphSCC &SCC) override; + StringRef getPassName() const override { return "AMDGPU Annotate Kernel Features"; } @@ -58,7 +74,7 @@ public: AMDGPUAS AS); }; -} +} // end anonymous namespace char AMDGPUAnnotateKernelFeatures::ID = 0; @@ -156,8 +172,9 @@ static StringRef intrinsicToAttrName(Intrinsic::ID ID, case Intrinsic::amdgcn_dispatch_id: return "amdgpu-dispatch-id"; case Intrinsic::amdgcn_kernarg_segment_ptr: - case Intrinsic::amdgcn_implicitarg_ptr: return "amdgpu-kernarg-segment-ptr"; + case Intrinsic::amdgcn_implicitarg_ptr: + return "amdgpu-implicitarg-ptr"; case Intrinsic::amdgcn_queue_ptr: case Intrinsic::trap: case Intrinsic::debugtrap: @@ -190,7 +207,8 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee, { "amdgpu-work-group-id-z" }, { "amdgpu-dispatch-ptr" }, { "amdgpu-dispatch-id" }, - { "amdgpu-kernarg-segment-ptr" } + { "amdgpu-kernarg-segment-ptr" }, + { "amdgpu-implicitarg-ptr" } }; if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) @@ -292,7 +310,6 @@ bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) { Changed |= addFeatureAttributes(*F); } - return Changed; } diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp new file mode 100644 index 000000000000..dcca3a2fab96 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -0,0 +1,131 @@ +//===----------------------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUArgumentUsageInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-argument-reg-usage-info" + +INITIALIZE_PASS(AMDGPUArgumentUsageInfo, DEBUG_TYPE, + "Argument Register Usage Information Storage", false, true) + +void ArgDescriptor::print(raw_ostream &OS, + const TargetRegisterInfo *TRI) const { + if (!isSet()) { + OS << "\n"; + return; + } + + if (isRegister()) + OS << "Reg " << printReg(getRegister(), TRI) << '\n'; + else + OS << "Stack offset " << getStackOffset() << '\n'; +} + +char AMDGPUArgumentUsageInfo::ID = 0; + +const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{}; + +bool AMDGPUArgumentUsageInfo::doInitialization(Module &M) { + return false; +} + +bool AMDGPUArgumentUsageInfo::doFinalization(Module &M) { + ArgInfoMap.clear(); + return false; +} + +void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const { + for (const auto &FI : ArgInfoMap) { + OS << "Arguments for " << FI.first->getName() << '\n' + << " PrivateSegmentBuffer: " << FI.second.PrivateSegmentBuffer + << " DispatchPtr: " << FI.second.DispatchPtr + << " QueuePtr: " << FI.second.QueuePtr + << " KernargSegmentPtr: " << FI.second.KernargSegmentPtr + << " DispatchID: " << FI.second.DispatchID + << " FlatScratchInit: " << FI.second.FlatScratchInit + << " PrivateSegmentSize: " << FI.second.PrivateSegmentSize + << " GridWorkgroupCountX: " << FI.second.GridWorkGroupCountX + << " GridWorkgroupCountY: " << FI.second.GridWorkGroupCountY + << " GridWorkgroupCountZ: " << FI.second.GridWorkGroupCountZ + << " WorkGroupIDX: " << FI.second.WorkGroupIDX + << " WorkGroupIDY: " << FI.second.WorkGroupIDY + << " WorkGroupIDZ: " << FI.second.WorkGroupIDZ + << " WorkGroupInfo: " << FI.second.WorkGroupInfo + << " PrivateSegmentWaveByteOffset: " + << 
FI.second.PrivateSegmentWaveByteOffset + << " ImplicitBufferPtr: " << FI.second.ImplicitBufferPtr + << " ImplicitArgPtr: " << FI.second.ImplicitArgPtr + << " WorkItemIDX " << FI.second.WorkItemIDX + << " WorkItemIDY " << FI.second.WorkItemIDY + << " WorkItemIDZ " << FI.second.WorkItemIDZ + << '\n'; + } +} + +std::pair +AMDGPUFunctionArgInfo::getPreloadedValue( + AMDGPUFunctionArgInfo::PreloadedValue Value) const { + switch (Value) { + case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER: { + return std::make_pair( + PrivateSegmentBuffer ? &PrivateSegmentBuffer : nullptr, + &AMDGPU::SGPR_128RegClass); + } + case AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR: + return std::make_pair(ImplicitBufferPtr ? &ImplicitBufferPtr : nullptr, + &AMDGPU::SGPR_64RegClass); + case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: + return std::make_pair(WorkGroupIDX ? &WorkGroupIDX : nullptr, + &AMDGPU::SGPR_32RegClass); + + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: + return std::make_pair(WorkGroupIDY ? &WorkGroupIDY : nullptr, + &AMDGPU::SGPR_32RegClass); + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: + return std::make_pair(WorkGroupIDZ ? &WorkGroupIDZ : nullptr, + &AMDGPU::SGPR_32RegClass); + case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: + return std::make_pair( + PrivateSegmentWaveByteOffset ? &PrivateSegmentWaveByteOffset : nullptr, + &AMDGPU::SGPR_32RegClass); + case AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR: + return std::make_pair(KernargSegmentPtr ? &KernargSegmentPtr : nullptr, + &AMDGPU::SGPR_64RegClass); + case AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR: + return std::make_pair(ImplicitArgPtr ? &ImplicitArgPtr : nullptr, + &AMDGPU::SGPR_64RegClass); + case AMDGPUFunctionArgInfo::DISPATCH_ID: + return std::make_pair(DispatchID ? &DispatchID : nullptr, + &AMDGPU::SGPR_64RegClass); + case AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT: + return std::make_pair(FlatScratchInit ? 
&FlatScratchInit : nullptr, + &AMDGPU::SGPR_64RegClass); + case AMDGPUFunctionArgInfo::DISPATCH_PTR: + return std::make_pair(DispatchPtr ? &DispatchPtr : nullptr, + &AMDGPU::SGPR_64RegClass); + case AMDGPUFunctionArgInfo::QUEUE_PTR: + return std::make_pair(QueuePtr ? &QueuePtr : nullptr, + &AMDGPU::SGPR_64RegClass); + case AMDGPUFunctionArgInfo::WORKITEM_ID_X: + return std::make_pair(WorkItemIDX ? &WorkItemIDX : nullptr, + &AMDGPU::VGPR_32RegClass); + case AMDGPUFunctionArgInfo::WORKITEM_ID_Y: + return std::make_pair(WorkItemIDY ? &WorkItemIDY : nullptr, + &AMDGPU::VGPR_32RegClass); + case AMDGPUFunctionArgInfo::WORKITEM_ID_Z: + return std::make_pair(WorkItemIDZ ? &WorkItemIDZ : nullptr, + &AMDGPU::VGPR_32RegClass); + } + llvm_unreachable("unexpected preloaded value type"); +} diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h new file mode 100644 index 000000000000..bf9635549a8c --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -0,0 +1,177 @@ +//==- AMDGPUArgumentrUsageInfo.h - Function Arg Usage Info -------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/Function.h" +#include "llvm/Pass.h" + +namespace llvm { + +class Function; +class raw_ostream; +class SISubtarget; +class TargetMachine; +class TargetRegisterClass; +class TargetRegisterInfo; + +struct ArgDescriptor { +private: + friend struct AMDGPUFunctionArgInfo; + friend class AMDGPUArgumentUsageInfo; + + union { + unsigned Register; + unsigned StackOffset; + }; + + bool IsStack : 1; + bool IsSet : 1; + + ArgDescriptor(unsigned Val = 0, bool IsStack = false, bool IsSet = false) + : Register(Val), IsStack(IsStack), IsSet(IsSet) {} +public: + static ArgDescriptor createRegister(unsigned Reg) { + return ArgDescriptor(Reg, false, true); + } + + static ArgDescriptor createStack(unsigned Reg) { + return ArgDescriptor(Reg, true, true); + } + + bool isSet() const { + return IsSet; + } + + explicit operator bool() const { + return isSet(); + } + + bool isRegister() const { + return !IsStack; + } + + unsigned getRegister() const { + assert(!IsStack); + return Register; + } + + unsigned getStackOffset() const { + assert(IsStack); + return StackOffset; + } + + void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr) const; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) { + Arg.print(OS); + return OS; +} + +struct AMDGPUFunctionArgInfo { + enum PreloadedValue { + // SGPRS: + PRIVATE_SEGMENT_BUFFER = 0, + DISPATCH_PTR = 1, + QUEUE_PTR = 2, + KERNARG_SEGMENT_PTR = 3, + DISPATCH_ID = 4, + FLAT_SCRATCH_INIT = 5, + WORKGROUP_ID_X = 10, + WORKGROUP_ID_Y = 11, + WORKGROUP_ID_Z = 12, + PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, + IMPLICIT_BUFFER_PTR = 15, + IMPLICIT_ARG_PTR = 16, + + // VGPRS: + WORKITEM_ID_X = 17, + WORKITEM_ID_Y = 18, + WORKITEM_ID_Z = 19, + FIRST_VGPR_VALUE = 
WORKITEM_ID_X + }; + + // Kernel input registers setup for the HSA ABI in allocation order. + + // User SGPRs in kernels + // XXX - Can these require argument spills? + ArgDescriptor PrivateSegmentBuffer; + ArgDescriptor DispatchPtr; + ArgDescriptor QueuePtr; + ArgDescriptor KernargSegmentPtr; + ArgDescriptor DispatchID; + ArgDescriptor FlatScratchInit; + ArgDescriptor PrivateSegmentSize; + ArgDescriptor GridWorkGroupCountX; + ArgDescriptor GridWorkGroupCountY; + ArgDescriptor GridWorkGroupCountZ; + + // System SGPRs in kernels. + ArgDescriptor WorkGroupIDX; + ArgDescriptor WorkGroupIDY; + ArgDescriptor WorkGroupIDZ; + ArgDescriptor WorkGroupInfo; + ArgDescriptor PrivateSegmentWaveByteOffset; + + // Pointer with offset from kernargsegmentptr to where special ABI arguments + // are passed to callable functions. + ArgDescriptor ImplicitArgPtr; + + // Input registers for non-HSA ABI + ArgDescriptor ImplicitBufferPtr = 0; + + // VGPRs inputs. These are always v0, v1 and v2 for entry functions. 
+ ArgDescriptor WorkItemIDX; + ArgDescriptor WorkItemIDY; + ArgDescriptor WorkItemIDZ; + + std::pair + getPreloadedValue(PreloadedValue Value) const; +}; + +class AMDGPUArgumentUsageInfo : public ImmutablePass { +private: + static const AMDGPUFunctionArgInfo ExternFunctionInfo; + DenseMap ArgInfoMap; + +public: + static char ID; + + AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + bool doInitialization(Module &M) override; + bool doFinalization(Module &M) override; + + void print(raw_ostream &OS, const Module *M = nullptr) const override; + + void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo) { + ArgInfoMap[&F] = ArgInfo; + } + + const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const { + auto I = ArgInfoMap.find(&F); + if (I == ArgInfoMap.end()) { + assert(F.isDeclaration()); + return ExternFunctionInfo; + } + + return I->second; + } +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 2247814cfe55..bb628b8c558f 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -32,15 +32,17 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; +using namespace llvm::AMDGPU; // TODO: This should get the default rounding mode from the kernel. 
We just set // the default here, but this could change if the OpenCL rounding mode pragmas @@ -105,28 +107,71 @@ const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const { return TM.getMCSubtargetInfo(); } -AMDGPUTargetStreamer& AMDGPUAsmPrinter::getTargetStreamer() const { - return static_cast(*OutStreamer->getTargetStreamer()); +AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const { + if (!OutStreamer) + return nullptr; + return static_cast(OutStreamer->getTargetStreamer()); } void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { - if (TM.getTargetTriple().getOS() != Triple::AMDHSA) + if (TM.getTargetTriple().getArch() != Triple::amdgcn) + return; + + if (TM.getTargetTriple().getOS() != Triple::AMDHSA && + TM.getTargetTriple().getOS() != Triple::AMDPAL) + return; + + if (TM.getTargetTriple().getOS() == Triple::AMDHSA) + HSAMetadataStream.begin(M); + + if (TM.getTargetTriple().getOS() == Triple::AMDPAL) + readPALMetadata(M); + + // Deprecated notes are not emitted for code object v3. + if (IsaInfo::hasCodeObjectV3(getSTI()->getFeatureBits())) return; - AMDGPU::IsaInfo::IsaVersion ISA = - AMDGPU::IsaInfo::getIsaVersion(getSTI()->getFeatureBits()); + // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2. + if (TM.getTargetTriple().getOS() == Triple::AMDHSA) + getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1); - getTargetStreamer().EmitDirectiveHSACodeObjectVersion(2, 1); - getTargetStreamer().EmitDirectiveHSACodeObjectISA( + // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2. 
+ IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(getSTI()->getFeatureBits()); + getTargetStreamer()->EmitDirectiveHSACodeObjectISA( ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU"); - getTargetStreamer().EmitStartOfCodeObjectMetadata(M); } void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { - if (TM.getTargetTriple().getOS() != Triple::AMDHSA) + if (TM.getTargetTriple().getArch() != Triple::amdgcn) + return; + + // Following code requires TargetStreamer to be present. + if (!getTargetStreamer()) return; - getTargetStreamer().EmitEndOfCodeObjectMetadata(); + // Emit ISA Version (NT_AMD_AMDGPU_ISA). + std::string ISAVersionString; + raw_string_ostream ISAVersionStream(ISAVersionString); + IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream); + getTargetStreamer()->EmitISAVersion(ISAVersionStream.str()); + + // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). + if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { + HSAMetadataStream.end(); + getTargetStreamer()->EmitHSAMetadata(HSAMetadataStream.getHSAMetadata()); + } + + // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA). + if (TM.getTargetTriple().getOS() == Triple::AMDPAL) { + // Copy the PAL metadata from the map where we collected it into a vector, + // then write it as a .note. 
+ PALMD::Metadata PALMetadataVector; + for (auto i : PALMetadataMap) { + PALMetadataVector.push_back(i.first); + PALMetadataVector.push_back(i.second); + } + getTargetStreamer()->EmitPALMetadata(PALMetadataVector); + } } bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough( @@ -154,13 +199,15 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF); OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); - getTargetStreamer().EmitAMDKernelCodeT(KernelCode); + getTargetStreamer()->EmitAMDKernelCodeT(KernelCode); } if (TM.getTargetTriple().getOS() != Triple::AMDHSA) return; - getTargetStreamer().EmitKernelCodeObjectMetadata(*MF->getFunction(), - KernelCode); + + HSAMetadataStream.emitKernel(MF->getFunction(), + getHSACodeProps(*MF, CurrentProgramInfo), + getHSADebugProps(*MF, CurrentProgramInfo)); } void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { @@ -168,18 +215,38 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { const AMDGPUSubtarget &STM = MF->getSubtarget(); if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) { SmallString<128> SymbolName; - getNameWithPrefix(SymbolName, MF->getFunction()), - getTargetStreamer().EmitAMDGPUSymbolType( + getNameWithPrefix(SymbolName, &MF->getFunction()), + getTargetStreamer()->EmitAMDGPUSymbolType( SymbolName, ELF::STT_AMDGPU_HSA_KERNEL); } + const AMDGPUSubtarget &STI = MF->getSubtarget(); + if (STI.dumpCode()) { + // Disassemble function name label to text. + DisasmLines.push_back(MF->getName().str() + ":"); + DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); + HexLines.push_back(""); + } AsmPrinter::EmitFunctionEntryLabel(); } +void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { + const AMDGPUSubtarget &STI = MBB.getParent()->getSubtarget(); + if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) { + // Write a line for the basic block label if it is not only fallthrough. 
+ DisasmLines.push_back( + (Twine("BB") + Twine(getFunctionNumber()) + + "_" + Twine(MBB.getNumber()) + ":").str()); + DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size()); + HexLines.push_back(""); + } + AsmPrinter::EmitBasicBlockStart(MBB); +} + void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // Group segment variables aren't emitted in HSA. - if (AMDGPU::isGroupSegment(GV, AMDGPUASI)) + if (AMDGPU::isGroupSegment(GV)) return; AsmPrinter::EmitGlobalVariable(GV); @@ -190,11 +257,32 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { return AsmPrinter::doFinalization(M); } +// For the amdpal OS type, read the amdgpu.pal.metadata supplied by the +// frontend into our PALMetadataMap, ready for per-function modification. It +// is a NamedMD containing an MDTuple containing a number of MDNodes each of +// which is an integer value, and each two integer values forms a key=value +// pair that we store as PALMetadataMap[key]=value in the map. +void AMDGPUAsmPrinter::readPALMetadata(Module &M) { + auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata"); + if (!NamedMD || !NamedMD->getNumOperands()) + return; + auto Tuple = dyn_cast(NamedMD->getOperand(0)); + if (!Tuple) + return; + for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) { + auto Key = mdconst::dyn_extract(Tuple->getOperand(I)); + auto Val = mdconst::dyn_extract(Tuple->getOperand(I + 1)); + if (!Key || !Val) + continue; + PALMetadataMap[Key->getZExtValue()] = Val->getZExtValue(); + } +} + // Print comments that apply to both callable functions and entry points. 
void AMDGPUAsmPrinter::emitCommonFunctionComments( uint32_t NumVGPR, uint32_t NumSGPR, - uint32_t ScratchSize, + uint64_t ScratchSize, uint64_t CodeSize) { OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); @@ -226,12 +314,14 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { getSIProgramInfo(CurrentProgramInfo, MF); } else { auto I = CallGraphResourceInfo.insert( - std::make_pair(MF.getFunction(), SIFunctionResourceInfo())); + std::make_pair(&MF.getFunction(), SIFunctionResourceInfo())); SIFunctionResourceInfo &Info = I.first->second; assert(I.second && "should only be called once per function"); Info = analyzeResourceUsage(MF); } + if (STM.isAmdPalOS()) + EmitPALMetadata(MF, CurrentProgramInfo); if (!STM.isAmdHsaOS()) { EmitProgramInfoSI(MF, CurrentProgramInfo); } @@ -253,7 +343,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { if (!MFI->isEntryFunction()) { OutStreamer->emitRawComment(" Function info:", false); - SIFunctionResourceInfo &Info = CallGraphResourceInfo[MF.getFunction()]; + SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()]; emitCommonFunctionComments( Info.NumVGPR, Info.getTotalNumSGPRs(MF.getSubtarget()), @@ -336,8 +426,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0)); for (size_t i = 0; i < DisasmLines.size(); ++i) { - std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' '); - Comment += " ; " + HexLines[i] + "\n"; + std::string Comment = "\n"; + if (!HexLines[i].empty()) { + Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' '); + Comment += " ; " + HexLines[i] + "\n"; + } OutStreamer->EmitBytes(StringRef(DisasmLines[i])); OutStreamer->EmitBytes(StringRef(Comment)); @@ -376,7 +469,7 @@ void 
AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { unsigned RsrcReg; if (STM.getGeneration() >= R600Subtarget::EVERGREEN) { // Evergreen / Northern Islands - switch (MF.getFunction()->getCallingConv()) { + switch (MF.getFunction().getCallingConv()) { default: LLVM_FALLTHROUGH; case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; @@ -385,7 +478,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { } } else { // R600 / R700 - switch (MF.getFunction()->getCallingConv()) { + switch (MF.getFunction().getCallingConv()) { default: LLVM_FALLTHROUGH; case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH; case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH; @@ -400,7 +493,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); - if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { + if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4); } @@ -500,29 +593,184 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( // If there are no calls, MachineRegisterInfo can tell us the used register // count easily. + // A tail call isn't considered a call for MachineFrameInfo's purposes. 
+ if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) { + MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestVGPRReg = Reg; + break; + } + } - MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestVGPRReg = Reg; - break; + MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestSGPRReg = Reg; + break; + } } + + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. + Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 : + TRI.getHWRegIndex(HighestVGPRReg) + 1; + Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 : + TRI.getHWRegIndex(HighestSGPRReg) + 1; + + return Info; } - MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestSGPRReg = Reg; - break; + int32_t MaxVGPR = -1; + int32_t MaxSGPR = -1; + uint64_t CalleeFrameSize = 0; + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + // TODO: Check regmasks? Do they occur anywhere except calls? 
+ for (const MachineOperand &MO : MI.operands()) { + unsigned Width = 0; + bool IsSGPR = false; + + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg(); + switch (Reg) { + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::SCC: + case AMDGPU::M0: + case AMDGPU::SRC_SHARED_BASE: + case AMDGPU::SRC_SHARED_LIMIT: + case AMDGPU::SRC_PRIVATE_BASE: + case AMDGPU::SRC_PRIVATE_LIMIT: + continue; + + case AMDGPU::NoRegister: + assert(MI.isDebugValue()); + continue; + + case AMDGPU::VCC: + case AMDGPU::VCC_LO: + case AMDGPU::VCC_HI: + Info.UsesVCC = true; + continue; + + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + case AMDGPU::FLAT_SCR_HI: + continue; + + case AMDGPU::TBA: + case AMDGPU::TBA_LO: + case AMDGPU::TBA_HI: + case AMDGPU::TMA: + case AMDGPU::TMA_LO: + case AMDGPU::TMA_HI: + llvm_unreachable("trap handler registers should not be used"); + + default: + break; + } + + if (AMDGPU::SReg_32RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_32RegClass.contains(Reg) && + "trap handler registers should not be used"); + IsSGPR = true; + Width = 1; + } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) { + IsSGPR = false; + Width = 1; + } else if (AMDGPU::SReg_64RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_64RegClass.contains(Reg) && + "trap handler registers should not be used"); + IsSGPR = true; + Width = 2; + } else if (AMDGPU::VReg_64RegClass.contains(Reg)) { + IsSGPR = false; + Width = 2; + } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { + IsSGPR = false; + Width = 3; + } else if (AMDGPU::SReg_128RegClass.contains(Reg)) { + IsSGPR = true; + Width = 4; + } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { + IsSGPR = false; + Width = 4; + } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { + IsSGPR = true; + Width = 8; + } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { + IsSGPR = false; + Width = 8; + } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { + IsSGPR = true; + Width = 16; + } else if 
(AMDGPU::VReg_512RegClass.contains(Reg)) { + IsSGPR = false; + Width = 16; + } else { + llvm_unreachable("Unknown register class"); + } + unsigned HWReg = TRI.getHWRegIndex(Reg); + int MaxUsed = HWReg + Width - 1; + if (IsSGPR) { + MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; + } else { + MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR; + } + } + + if (MI.isCall()) { + // Pseudo used just to encode the underlying global. Is there a better + // way to track this? + + const MachineOperand *CalleeOp + = TII->getNamedOperand(MI, AMDGPU::OpName::callee); + const Function *Callee = cast(CalleeOp->getGlobal()); + if (Callee->isDeclaration()) { + // If this is a call to an external function, we can't do much. Make + // conservative guesses. + + // 48 SGPRs - vcc, - flat_scr, -xnack + int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true, + ST.hasFlatAddressSpace()); + MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess); + MaxVGPR = std::max(MaxVGPR, 23); + + CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384)); + Info.UsesVCC = true; + Info.UsesFlatScratch = ST.hasFlatAddressSpace(); + Info.HasDynamicallySizedStack = true; + } else { + // We force CodeGen to run in SCC order, so the callee's register + // usage etc. should be the cumulative usage of all callees. + auto I = CallGraphResourceInfo.find(Callee); + assert(I != CallGraphResourceInfo.end() && + "callee should have been handled before caller"); + + MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR); + MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR); + CalleeFrameSize + = std::max(I->second.PrivateSegmentSize, CalleeFrameSize); + Info.UsesVCC |= I->second.UsesVCC; + Info.UsesFlatScratch |= I->second.UsesFlatScratch; + Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack; + Info.HasRecursion |= I->second.HasRecursion; + } + + if (!Callee->doesNotRecurse()) + Info.HasRecursion = true; + } } } - // We found the maximum register index. 
They start at 0, so add one to get the - // number of registers. - Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 : - TRI.getHWRegIndex(HighestVGPRReg) + 1; - Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 : - TRI.getHWRegIndex(HighestSGPRReg) + 1; + Info.NumExplicitSGPR = MaxSGPR + 1; + Info.NumVGPR = MaxVGPR + 1; + Info.PrivateSegmentSize += CalleeFrameSize; return Info; } @@ -538,6 +786,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.FlatUsed = Info.UsesFlatScratch; ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion; + if (!isUInt<32>(ProgInfo.ScratchSize)) { + DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), + ProgInfo.ScratchSize, DS_Error); + MF.getFunction().getContext().diagnose(DiagStackSize); + } + const SISubtarget &STM = MF.getSubtarget(); const SIMachineFunctionInfo *MFI = MF.getInfo(); const SIInstrInfo *TII = STM.getInstrInfo(); @@ -554,8 +808,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs(); if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { // This can happen due to a compiler bug or when using inline asm. - LLVMContext &Ctx = MF.getFunction()->getContext(); - DiagnosticInfoResourceLimit Diag(*MF.getFunction(), + LLVMContext &Ctx = MF.getFunction().getContext(); + DiagnosticInfoResourceLimit Diag(MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR, DS_Error, DK_ResourceLimit, @@ -582,8 +836,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) { // This can happen due to a compiler bug or when using inline asm to use // the registers which are usually reserved for vcc etc. 
- LLVMContext &Ctx = MF.getFunction()->getContext(); - DiagnosticInfoResourceLimit Diag(*MF.getFunction(), + LLVMContext &Ctx = MF.getFunction().getContext(); + DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers", ProgInfo.NumSGPR, DS_Error, DK_ResourceLimit, @@ -602,15 +856,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) { - LLVMContext &Ctx = MF.getFunction()->getContext(); - DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs", + LLVMContext &Ctx = MF.getFunction().getContext(); + DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs", MFI->getNumUserSGPRs(), DS_Error); Ctx.diagnose(Diag); } if (MFI->getLDSSize() > static_cast(STM.getLocalMemorySize())) { - LLVMContext &Ctx = MF.getFunction()->getContext(); - DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory", + LLVMContext &Ctx = MF.getFunction().getContext(); + DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory", MFI->getLDSSize(), DS_Error); Ctx.diagnose(Diag); } @@ -710,10 +964,12 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) { switch (CallConv) { default: LLVM_FALLTHROUGH; case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; + case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS; case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS; + case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES; case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; - case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; + case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; } } @@ -721,9 +977,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) { const SISubtarget &STM = MF.getSubtarget(); const SIMachineFunctionInfo *MFI = MF.getInfo(); - 
unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv()); + unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv()); - if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { + if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4); @@ -740,19 +996,24 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(RsrcReg, 4); OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); - if (STM.isVGPRSpillingEnabled(*MF.getFunction())) { + unsigned Rsrc2Val = 0; + if (STM.isVGPRSpillingEnabled(MF.getFunction())) { OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); + if (TM.getTargetTriple().getOS() == Triple::AMDPAL) + Rsrc2Val = S_00B84C_SCRATCH_EN(CurrentProgramInfo.ScratchBlocks > 0); + } + if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { + OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); + OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4); + OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); + OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); + Rsrc2Val |= S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks); + } + if (Rsrc2Val) { + OutStreamer->EmitIntValue(RsrcReg + 4 /*rsrc2*/, 4); + OutStreamer->EmitIntValue(Rsrc2Val, 4); } - } - - if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) { - OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); - OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4); - OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); - OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4); - OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); - OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); } 
OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4); @@ -761,6 +1022,75 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4); } +// This is the equivalent of EmitProgramInfoSI above, but for when the OS type +// is AMDPAL. It stores each compute/SPI register setting and other PAL +// metadata items into the PALMetadataMap, combining with any provided by the +// frontend as LLVM metadata. Once all functions are written, PALMetadataMap is +// then written as a single block in the .note section. +void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF, + const SIProgramInfo &CurrentProgramInfo) { + const SIMachineFunctionInfo *MFI = MF.getInfo(); + // Given the calling convention, calculate the register number for rsrc1. In + // principle the register number could change in future hardware, but we know + // it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so + // we can use the same fixed value that .AMDGPU.config has for Mesa. Note + // that we use a register number rather than a byte offset, so we need to + // divide by 4. + unsigned Rsrc1Reg = getRsrcReg(MF.getFunction().getCallingConv()) / 4; + unsigned Rsrc2Reg = Rsrc1Reg + 1; + // Also calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used + // with a constant offset to access any non-register shader-specific PAL + // metadata key. 
+ unsigned ScratchSizeKey = PALMD::Key::CS_SCRATCH_SIZE; + switch (MF.getFunction().getCallingConv()) { + case CallingConv::AMDGPU_PS: + ScratchSizeKey = PALMD::Key::PS_SCRATCH_SIZE; + break; + case CallingConv::AMDGPU_VS: + ScratchSizeKey = PALMD::Key::VS_SCRATCH_SIZE; + break; + case CallingConv::AMDGPU_GS: + ScratchSizeKey = PALMD::Key::GS_SCRATCH_SIZE; + break; + case CallingConv::AMDGPU_ES: + ScratchSizeKey = PALMD::Key::ES_SCRATCH_SIZE; + break; + case CallingConv::AMDGPU_HS: + ScratchSizeKey = PALMD::Key::HS_SCRATCH_SIZE; + break; + case CallingConv::AMDGPU_LS: + ScratchSizeKey = PALMD::Key::LS_SCRATCH_SIZE; + break; + } + unsigned NumUsedVgprsKey = ScratchSizeKey + + PALMD::Key::VS_NUM_USED_VGPRS - PALMD::Key::VS_SCRATCH_SIZE; + unsigned NumUsedSgprsKey = ScratchSizeKey + + PALMD::Key::VS_NUM_USED_SGPRS - PALMD::Key::VS_SCRATCH_SIZE; + PALMetadataMap[NumUsedVgprsKey] = CurrentProgramInfo.NumVGPRsForWavesPerEU; + PALMetadataMap[NumUsedSgprsKey] = CurrentProgramInfo.NumSGPRsForWavesPerEU; + if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { + PALMetadataMap[Rsrc1Reg] |= CurrentProgramInfo.ComputePGMRSrc1; + PALMetadataMap[Rsrc2Reg] |= CurrentProgramInfo.ComputePGMRSrc2; + // ScratchSize is in bytes, 16 aligned. + PALMetadataMap[ScratchSizeKey] |= + alignTo(CurrentProgramInfo.ScratchSize, 16); + } else { + PALMetadataMap[Rsrc1Reg] |= S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | + S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks); + if (CurrentProgramInfo.ScratchBlocks > 0) + PALMetadataMap[Rsrc2Reg] |= S_00B84C_SCRATCH_EN(1); + // ScratchSize is in bytes, 16 aligned. 
+ PALMetadataMap[ScratchSizeKey] |= + alignTo(CurrentProgramInfo.ScratchSize, 16); + } + if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) { + PALMetadataMap[Rsrc2Reg] |= + S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks); + PALMetadataMap[R_0286CC_SPI_PS_INPUT_ENA / 4] |= MFI->getPSInputEnable(); + PALMetadataMap[R_0286D0_SPI_PS_INPUT_ADDR / 4] |= MFI->getPSInputAddr(); + } +} + // This is supposed to be log2(Size) static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { switch (Size) { @@ -862,23 +1192,81 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, } } +AMDGPU::HSAMD::Kernel::CodeProps::Metadata AMDGPUAsmPrinter::getHSACodeProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const { + const SISubtarget &STM = MF.getSubtarget(); + const SIMachineFunctionInfo &MFI = *MF.getInfo(); + HSAMD::Kernel::CodeProps::Metadata HSACodeProps; + + HSACodeProps.mKernargSegmentSize = + STM.getKernArgSegmentSize(MF, MFI.getABIArgOffset()); + HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize; + HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize; + HSACodeProps.mKernargSegmentAlign = + std::max(uint32_t(4), MFI.getMaxKernArgAlign()); + HSACodeProps.mWavefrontSize = STM.getWavefrontSize(); + HSACodeProps.mNumSGPRs = CurrentProgramInfo.NumSGPR; + HSACodeProps.mNumVGPRs = CurrentProgramInfo.NumVGPR; + HSACodeProps.mMaxFlatWorkGroupSize = MFI.getMaxFlatWorkGroupSize(); + HSACodeProps.mIsDynamicCallStack = ProgramInfo.DynamicCallStack; + HSACodeProps.mIsXNACKEnabled = STM.isXNACKEnabled(); + HSACodeProps.mNumSpilledSGPRs = MFI.getNumSpilledSGPRs(); + HSACodeProps.mNumSpilledVGPRs = MFI.getNumSpilledVGPRs(); + + return HSACodeProps; +} + +AMDGPU::HSAMD::Kernel::DebugProps::Metadata AMDGPUAsmPrinter::getHSADebugProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const { + const SISubtarget &STM = MF.getSubtarget(); + HSAMD::Kernel::DebugProps::Metadata HSADebugProps; + + if 
(!STM.debuggerSupported()) + return HSADebugProps; + + HSADebugProps.mDebuggerABIVersion.push_back(1); + HSADebugProps.mDebuggerABIVersion.push_back(0); + HSADebugProps.mReservedNumVGPRs = ProgramInfo.ReservedVGPRCount; + HSADebugProps.mReservedFirstVGPR = ProgramInfo.ReservedVGPRFirst; + + if (STM.debuggerEmitPrologue()) { + HSADebugProps.mPrivateSegmentBufferSGPR = + ProgramInfo.DebuggerPrivateSegmentBufferSGPR; + HSADebugProps.mWavefrontPrivateSegmentOffsetSGPR = + ProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; + } + + return HSADebugProps; +} + bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { + // First try the generic code, which knows about modifiers like 'c' and 'n'. + if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O)) + return false; + if (ExtraCode && ExtraCode[0]) { if (ExtraCode[1] != 0) return true; // Unknown modifier. switch (ExtraCode[0]) { - default: - // See if this is a generic print operand - return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); case 'r': break; + default: + return true; } } - AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O, - *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo()); - return false; + // TODO: Should be able to support other operand types like globals. 
+ const MachineOperand &MO = MI->getOperand(OpNo); + if (MO.isReg()) { + AMDGPUInstPrinter::printRegOperand(MO.getReg(), O, + *MF->getSubtarget().getRegisterInfo()); + return false; + } + + return true; } diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 0a58ce06704d..51d48a0c7320 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -17,6 +17,7 @@ #include "AMDGPU.h" #include "AMDKernelCodeT.h" +#include "MCTargetDesc/AMDGPUHSAMetadataStreamer.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/AsmPrinter.h" #include @@ -40,7 +41,7 @@ private: // the end are tracked separately. int32_t NumVGPR = 0; int32_t NumExplicitSGPR = 0; - uint32_t PrivateSegmentSize = 0; + uint64_t PrivateSegmentSize = 0; bool UsesVCC = false; bool UsesFlatScratch = false; bool HasDynamicallySizedStack = false; @@ -60,7 +61,7 @@ private: uint32_t DX10Clamp = 0; uint32_t DebugMode = 0; uint32_t IEEEMode = 0; - uint32_t ScratchSize = 0; + uint64_t ScratchSize = 0; uint64_t ComputePGMRSrc1 = 0; @@ -113,9 +114,13 @@ private: SIProgramInfo CurrentProgramInfo; DenseMap CallGraphResourceInfo; + AMDGPU::HSAMD::MetadataStreamer HSAMetadataStream; + std::map PALMetadataMap; + uint64_t getFunctionCodeSize(const MachineFunction &MF) const; SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const; + void readPALMetadata(Module &M); void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF); void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo, const MachineFunction &MF) const; @@ -123,13 +128,23 @@ private: unsigned &NumSGPR, unsigned &NumVGPR) const; + AMDGPU::HSAMD::Kernel::CodeProps::Metadata getHSACodeProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const; + AMDGPU::HSAMD::Kernel::DebugProps::Metadata getHSADebugProps( + const MachineFunction &MF, + const SIProgramInfo &ProgramInfo) const; + /// \brief Emit register usage 
information so that the GPU driver /// can correctly setup the GPU state. void EmitProgramInfoR600(const MachineFunction &MF); - void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); + void EmitProgramInfoSI(const MachineFunction &MF, + const SIProgramInfo &KernelInfo); + void EmitPALMetadata(const MachineFunction &MF, + const SIProgramInfo &KernelInfo); void emitCommonFunctionComments(uint32_t NumVGPR, uint32_t NumSGPR, - uint32_t ScratchSize, + uint64_t ScratchSize, uint64_t CodeSize); public: @@ -140,7 +155,7 @@ public: const MCSubtargetInfo* getSTI() const; - AMDGPUTargetStreamer& getTargetStreamer() const; + AMDGPUTargetStreamer* getTargetStreamer() const; bool doFinalization(Module &M) override; bool runOnMachineFunction(MachineFunction &MF) override; @@ -166,6 +181,8 @@ public: void EmitFunctionEntryLabel() override; + void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override; + void EmitGlobalVariable(const GlobalVariable *GV) override; void EmitStartOfAsmFile(Module &M) override; @@ -180,8 +197,8 @@ public: raw_ostream &O) override; protected: - std::vector DisasmLines, HexLines; - size_t DisasmLineMaxLen; + mutable std::vector DisasmLines, HexLines; + mutable size_t DisasmLineMaxLen; AMDGPUAS AMDGPUASI; }; diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 515cc07dd449..5a9138731934 100644 --- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -26,10 +26,6 @@ using namespace llvm; -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "This shouldn't be built without GISel" -#endif - AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) : CallLowering(&TLI), AMDGPUASI(TLI.getAMDGPUAS()) { } @@ -45,15 +41,15 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder, unsigned Offset) const { MachineFunction &MF = MIRBuilder.getMF(); - const SIRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + 
const SIMachineFunctionInfo *MFI = MF.getInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); - const Function &F = *MF.getFunction(); + const Function &F = MF.getFunction(); const DataLayout &DL = F.getParent()->getDataLayout(); PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS); LLT PtrType = getLLTForType(*PtrTy, DL); unsigned DstReg = MRI.createGenericVirtualRegister(PtrType); unsigned KernArgSegmentPtr = - TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr); unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); @@ -68,7 +64,7 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy, unsigned Offset, unsigned DstReg) const { MachineFunction &MF = MIRBuilder.getMF(); - const Function &F = *MF.getFunction(); + const Function &F = MF.getFunction(); const DataLayout &DL = F.getParent()->getDataLayout(); PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); @@ -144,18 +140,38 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, Function::const_arg_iterator CurOrigArg = F.arg_begin(); const AMDGPUTargetLowering &TLI = *getTLI(); for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) { - MVT ValVT = TLI.getValueType(DL, CurOrigArg->getType()).getSimpleVT(); + EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType()); + + // We can only handle simple value types at the moment. 
+ if (!ValEVT.isSimple()) + return false; + MVT ValVT = ValEVT.getSimpleVT(); ISD::ArgFlagsTy Flags; + ArgInfo OrigArg{VRegs[i], CurOrigArg->getType()}; + setArgFlags(OrigArg, i + 1, DL, F); Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType())); CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false); bool Res = - AssignFn(i, ValVT, ValVT, CCValAssign::Full, Flags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; + AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo); + + // Fail if we don't know how to handle this type. + if (Res) + return false; } Function::const_arg_iterator Arg = F.arg_begin(); + + if (F.getCallingConv() == CallingConv::AMDGPU_VS) { + for (unsigned i = 0; i != NumArgs; ++i, ++Arg) { + CCValAssign &VA = ArgLocs[i]; + MRI.addLiveIn(VA.getLocReg(), VRegs[i]); + MIRBuilder.getMBB().addLiveIn(VA.getLocReg()); + MIRBuilder.buildCopy(VRegs[i], VA.getLocReg()); + } + return true; + } + for (unsigned i = 0; i != NumArgs; ++i, ++Arg) { // FIXME: We should be getting DebugInfo from the arguments some how. 
CCValAssign &VA = ArgLocs[i]; diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td index 4bef7a89bfe3..c1c066fd1404 100644 --- a/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -162,6 +162,10 @@ def CC_AMDGPU : CallingConv<[ "(State.getMachineFunction().getSubtarget()).getGeneration() >= " "AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo>, + CCIf<"static_cast" + "(State.getMachineFunction().getSubtarget()).getGeneration() >= " + "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C", + CCDelegateTo>, CCIf<"static_cast" "(State.getMachineFunction().getSubtarget()).getGeneration() < " "AMDGPUSubtarget::SOUTHERN_ISLANDS", diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 31ee9206ae27..b17b67167666 100644 --- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -18,6 +18,7 @@ #include "AMDGPUTargetMachine.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/Loads.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" @@ -53,6 +54,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass, DivergenceAnalysis *DA = nullptr; Module *Mod = nullptr; bool HasUnsafeFPMath = false; + AMDGPUAS AMDGPUASI; /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to /// binary operation \p V. @@ -123,6 +125,15 @@ class AMDGPUCodeGenPrepare : public FunctionPass, /// /// \returns True. bool promoteUniformBitreverseToI32(IntrinsicInst &I) const; + /// \brief Widen a scalar load. + /// + /// \details Widen a uniform, small-type scalar load from constant + /// memory to a full 32 bits and then truncate the loaded value, to allow a + /// scalar load instead of a vector load. + /// + /// \returns True if \p I can be widened. 
+ + bool canWidenScalarExtLoad(LoadInst &I) const; public: static char ID; @@ -133,6 +144,7 @@ public: bool visitInstruction(Instruction &I) { return false; } bool visitBinaryOperator(BinaryOperator &I); + bool visitLoadInst(LoadInst &I); bool visitICmpInst(ICmpInst &I); bool visitSelectInst(SelectInst &I); @@ -223,6 +235,16 @@ static bool promotedOpIsNUW(const Instruction &I) { } } +bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const { + Type *Ty = I.getType(); + const DataLayout &DL = Mod->getDataLayout(); + int TySize = DL.getTypeSizeInBits(Ty); + unsigned Align = I.getAlignment() ? + I.getAlignment() : DL.getABITypeAlignment(Ty); + + return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I); +} + bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const { assert(needsPromotionToI32(I.getType()) && "I does not need promotion to i32"); @@ -378,7 +400,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { return false; FastMathFlags FMF = FPOp->getFastMathFlags(); - bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() || + bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() || FMF.allowReciprocal(); // With UnsafeDiv node will be optimized to just rcp and mul. 
@@ -443,6 +465,29 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { return Changed; } +bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { + if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && + canWidenScalarExtLoad(I)) { + IRBuilder<> Builder(&I); + Builder.SetCurrentDebugLocation(I.getDebugLoc()); + + Type *I32Ty = Builder.getInt32Ty(); + Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace()); + Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT); + Value *WidenLoad = Builder.CreateLoad(BitCast); + + int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType()); + Type *IntNTy = Builder.getIntNTy(TySize); + Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy); + Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType()); + I.replaceAllUsesWith(ValOrig); + I.eraseFromParent(); + return true; + } + + return false; +} + bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) { bool Changed = false; diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 8e187c7e56c1..91fe921bfeec 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -15,7 +15,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { @@ -33,10 +33,6 @@ public: /// \returns The number of 32-bit sub-registers that are used when storing /// values to the stack. 
unsigned getStackWidth(const MachineFunction &MF) const; - - bool hasFP(const MachineFunction &MF) const override { - return false; - } }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def index 5cb9036f4823..bf7deb500d1a 100644 --- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def +++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -11,10 +11,6 @@ /// \todo This should be generated by TableGen. //===----------------------------------------------------------------------===// -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "You shouldn't build this" -#endif - namespace llvm { namespace AMDGPU { diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index f235313e4853..f4776adb069c 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -13,10 +13,12 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPUInstrInfo.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" #include "SIDefines.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" @@ -68,19 +70,30 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { // make the right decision when generating code for different targets. 
const AMDGPUSubtarget *Subtarget; AMDGPUAS AMDGPUASI; + bool EnableLateStructurizeCFG; public: - explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(TM, OptLevel){ - AMDGPUASI = AMDGPU::getAMDGPUAS(TM); + explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr, + CodeGenOpt::Level OptLevel = CodeGenOpt::Default) + : SelectionDAGISel(*TM, OptLevel) { + AMDGPUASI = AMDGPU::getAMDGPUAS(*TM); + EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG; } ~AMDGPUDAGToDAGISel() override = default; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + SelectionDAGISel::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; void Select(SDNode *N) override; StringRef getPassName() const override; void PostprocessISelDAG() override; +protected: + void SelectBuildVector(SDNode *N, unsigned RegClassID); + private: std::pair foldFrameIndex(SDValue N) const; bool isNoNanSrc(SDValue N) const; @@ -99,8 +112,8 @@ private: bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, SDValue& Offset); - bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); - bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); + virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); + virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, unsigned OffsetBits) const; bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; @@ -116,10 +129,10 @@ private: bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &SLC) const; - bool SelectMUBUFScratchOffen(SDNode *Root, + bool SelectMUBUFScratchOffen(SDNode *Parent, SDValue Addr, SDValue &RSrc, SDValue &VAddr, SDValue &SOffset, SDValue &ImmOffset) 
const; - bool SelectMUBUFScratchOffset(SDNode *Root, + bool SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; @@ -140,6 +153,10 @@ private: bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const; + bool SelectFlatAtomicSigned(SDValue Addr, SDValue &VAddr, + SDValue &Offset, SDValue &SLC) const; + + template bool SelectFlatOffset(SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const; @@ -152,10 +169,10 @@ private: bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const; bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const; bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const; - bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const; bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3NoMods(SDValue In, SDValue &Src) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, @@ -174,9 +191,22 @@ private: bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp) const; + bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp) const; + + bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp) const; + bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const; + bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + + bool SelectHi16Elt(SDValue In, SDValue &Src) const; + void SelectADD_SUB_I64(SDNode *N); void SelectUADDO_USUBO(SDNode *N); void SelectDIV_SCALE(SDNode 
*N); + void SelectMAD_64_32(SDNode *N); void SelectFMA_W_CHAIN(SDNode *N); void SelectFMUL_W_CHAIN(SDNode *N); @@ -186,21 +216,49 @@ private: void SelectS_BFE(SDNode *N); bool isCBranchSCC(const SDNode *N) const; void SelectBRCOND(SDNode *N); + void SelectFMAD(SDNode *N); void SelectATOMIC_CMP_SWAP(SDNode *N); +protected: // Include the pieces autogenerated from the target description. #include "AMDGPUGenDAGISel.inc" }; +class R600DAGToDAGISel : public AMDGPUDAGToDAGISel { +public: + explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) : + AMDGPUDAGToDAGISel(TM, OptLevel) {} + + void Select(SDNode *N) override; + + bool SelectADDRIndirect(SDValue Addr, SDValue &Base, + SDValue &Offset) override; + bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, + SDValue &Offset) override; +}; + } // end anonymous namespace +INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "isel", + "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) +INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo) +INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel", + "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) + /// \brief This pass converts a legalized DAG into a AMDGPU-specific // DAG, ready for instruction scheduling. -FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM, +FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM, CodeGenOpt::Level OptLevel) { return new AMDGPUDAGToDAGISel(TM, OptLevel); } +/// \brief This pass converts a legalized DAG into a R600-specific +// DAG, ready for instruction scheduling. 
+FunctionPass *llvm::createR600ISelDag(TargetMachine *TM, + CodeGenOpt::Level OptLevel) { + return new R600DAGToDAGISel(TM, OptLevel); +} + bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget(); return SelectionDAGISel::runOnMachineFunction(MF); @@ -279,8 +337,8 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, } SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - cast(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS) + if (cast(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS || + !Subtarget->ldsRequiresM0Init()) return N; const SITargetLowering& Lowering = @@ -298,9 +356,7 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { Ops.push_back(N->getOperand(i)); } Ops.push_back(Glue); - CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); - - return N; + return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops); } static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { @@ -334,6 +390,58 @@ static bool getConstantValue(SDValue N, uint32_t &Out) { return false; } +void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { + EVT VT = N->getValueType(0); + unsigned NumVectorElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); + SDLoc DL(N); + SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); + + if (NumVectorElts == 1) { + CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0), + RegClass); + return; + } + + assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " + "supported yet"); + // 16 = Max Num Vector Elements + // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) + // 1 = Vector Register Class + SmallVector RegSeqArgs(NumVectorElts * 2 + 1); + + RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, 
DL, MVT::i32); + bool IsRegSeq = true; + unsigned NOps = N->getNumOperands(); + for (unsigned i = 0; i < NOps; i++) { + // XXX: Why is this here? + if (isa(N->getOperand(i))) { + IsRegSeq = false; + break; + } + RegSeqArgs[1 + (2 * i)] = N->getOperand(i); + RegSeqArgs[1 + (2 * i) + 1] = + CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, + MVT::i32); + } + if (NOps != NumVectorElts) { + // Fill in the missing undef elements if this was a scalar_to_vector. + assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); + MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + DL, EltVT); + for (unsigned i = NOps; i < NumVectorElts; ++i) { + RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); + RegSeqArgs[1 + (2 * i) + 1] = + CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32); + } + } + + if (!IsRegSeq) + SelectCode(N); + CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs); +} + void AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { @@ -346,18 +454,16 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { N = glueCopyToM0(N); switch (Opc) { - default: break; + default: + break; // We are selecting i64 ADD here instead of custom lower it during // DAG legalization, so we can fold some i64 ADDs used for address // calculation into the LOAD and STORE instructions. 
- case ISD::ADD: case ISD::ADDC: case ISD::ADDE: - case ISD::SUB: case ISD::SUBC: case ISD::SUBE: { - if (N->getValueType(0) != MVT::i64 || - Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + if (N->getValueType(0) != MVT::i64) break; SelectADD_SUB_I64(N); @@ -378,13 +484,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { } case ISD::SCALAR_TO_VECTOR: - case AMDGPUISD::BUILD_VERTICAL_VECTOR: case ISD::BUILD_VECTOR: { - unsigned RegClassID; - const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); - EVT EltVT = VT.getVectorElementType(); if (VT == MVT::v2i16 || VT == MVT::v2f16) { if (Opc == ISD::BUILD_VECTOR) { @@ -401,81 +503,13 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { break; } - assert(EltVT.bitsEq(MVT::i32)); - - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - RegClassID = selectSGPRVectorRegClassID(NumVectorElts); - } else { - // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG - // that adds a 128 bits reg copy when going through TwoAddressInstructions - // pass. We want to avoid 128 bits copies as much as possible because they - // can't be bundled by our scheduler. 
- switch(NumVectorElts) { - case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; - case 4: - if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) - RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; - else - RegClassID = AMDGPU::R600_Reg128RegClassID; - break; - default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); - } - } - - SDLoc DL(N); - SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); - - if (NumVectorElts == 1) { - CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0), - RegClass); - return; - } - - assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " - "supported yet"); - // 16 = Max Num Vector Elements - // 2 = 2 REG_SEQUENCE operands per element (value, subreg index) - // 1 = Vector Register Class - SmallVector RegSeqArgs(NumVectorElts * 2 + 1); - - RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); - bool IsRegSeq = true; - unsigned NOps = N->getNumOperands(); - for (unsigned i = 0; i < NOps; i++) { - // XXX: Why is this here? - if (isa(N->getOperand(i))) { - IsRegSeq = false; - break; - } - RegSeqArgs[1 + (2 * i)] = N->getOperand(i); - RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, - MVT::i32); - } - - if (NOps != NumVectorElts) { - // Fill in the missing undef elements if this was a scalar_to_vector. 
- assert(Opc == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts); - - MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - DL, EltVT); - for (unsigned i = NOps; i < NumVectorElts; ++i) { - RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); - RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32); - } - } - - if (!IsRegSeq) - break; - CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs); + assert(VT.getVectorElementType().bitsEq(MVT::i32)); + unsigned RegClassID = selectSGPRVectorRegClassID(NumVectorElts); + SelectBuildVector(N, RegClassID); return; } case ISD::BUILD_PAIR: { SDValue RC, SubReg0, SubReg1; - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - break; - } SDLoc DL(N); if (N->getValueType(0) == MVT::i128) { RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32); @@ -497,8 +531,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::Constant: case ISD::ConstantFP: { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) + if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) break; uint64_t Imm; @@ -533,9 +566,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) - break; - // There is a scalar version available, but unlike the vector version which // has a separate operand for the offset and width, the scalar version packs // the width and offset into a single operand. 
Try to move to the scalar @@ -565,6 +595,11 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectDIV_SCALE(N); return; } + case AMDGPUISD::MAD_I64_I32: + case AMDGPUISD::MAD_U64_U32: { + SelectMAD_64_32(N); + return; + } case ISD::CopyToReg: { const SITargetLowering& Lowering = *static_cast(getTargetLowering()); @@ -575,8 +610,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::SRL: case ISD::SRA: case ISD::SIGN_EXTEND_INREG: - if (N->getValueType(0) != MVT::i32 || - Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) + if (N->getValueType(0) != MVT::i32) break; SelectS_BFE(N); @@ -584,7 +618,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::BRCOND: SelectBRCOND(N); return; - + case ISD::FMAD: + SelectFMAD(N); + return; case AMDGPUISD::ATOMIC_CMP_SWAP: SelectATOMIC_CMP_SWAP(N); return; @@ -638,32 +674,8 @@ bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, } bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, - SDValue &Offset) { - ConstantSDNode *IMMOffset; - - if (Addr.getOpcode() == ISD::ADD - && (IMMOffset = dyn_cast(Addr.getOperand(1))) - && isInt<16>(IMMOffset->getZExtValue())) { - - Base = Addr.getOperand(0); - Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), - MVT::i32); - return true; - // If the pointer address is constant, we can move it to the offset field. 
- } else if ((IMMOffset = dyn_cast(Addr)) - && isInt<16>(IMMOffset->getZExtValue())) { - Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), - SDLoc(CurDAG->getEntryNode()), - AMDGPU::ZERO, MVT::i32); - Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), - MVT::i32); - return true; - } - - // Default case, no offset - Base = Addr; - Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - return true; + SDValue &Offset) { + return false; } bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, @@ -690,6 +702,7 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, return true; } +// FIXME: Should only handle addcarry/subcarry void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDLoc DL(N); SDValue LHS = N->getOperand(0); @@ -699,8 +712,7 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE); bool ProduceCarry = ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC; - bool IsAdd = - (Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE); + bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE; SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); @@ -782,7 +794,7 @@ void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) { void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) { SDLoc SL(N); - // src0_modifiers, src0, src1_modifiers, src1, clamp, omod + // src0_modifiers, src0, src1_modifiers, src1, clamp, omod SDValue Ops[8]; SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]); @@ -808,6 +820,19 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } +// We need to handle this here because tablegen doesn't support matching +// instructions with multiple outputs. 
+void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { + SDLoc SL(N); + bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; + unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32; + + SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + Clamp }; + CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); +} + bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, unsigned OffsetBits) const { if ((OffsetBits == 16 && !isUInt<16>(Offset)) || @@ -850,8 +875,12 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, Zero, Addr.getOperand(1)); if (isDSOffsetLegal(Sub, ByteOffset, 16)) { + // FIXME: Select to VOP3 version for with-carry. + unsigned SubOp = Subtarget->hasAddNoCarry() ? + AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32; + MachineSDNode *MachineSub - = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32, + = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Zero, Addr.getOperand(1)); Base = SDValue(MachineSub, 0); @@ -920,8 +949,11 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, Zero, Addr.getOperand(1)); if (isDSOffsetLegal(Sub, DWordOffset1, 8)) { + unsigned SubOp = Subtarget->hasAddNoCarry() ? 
+ AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32; + MachineSDNode *MachineSub - = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32, + = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Zero, Addr.getOperand(1)); Base = SDValue(MachineSub, 0); @@ -958,14 +990,6 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, return true; } -static bool isLegalMUBUFImmOffset(unsigned Imm) { - return isUInt<12>(Imm); -} - -static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { - return isLegalMUBUFImmOffset(Imm->getZExtValue()); -} - bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, @@ -1007,7 +1031,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, Ptr = N0; } - if (isLegalMUBUFImmOffset(C1)) { + if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) { Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); return true; } @@ -1104,7 +1128,7 @@ std::pair AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const MVT::i32)); } -bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root, +bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, SDValue Addr, SDValue &Rsrc, SDValue &VAddr, SDValue &SOffset, SDValue &ImmOffset) const { @@ -1117,8 +1141,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root, if (ConstantSDNode *CAddr = dyn_cast(Addr)) { unsigned Imm = CAddr->getZExtValue(); - assert(!isLegalMUBUFImmOffset(Imm) && - "should have been selected by other pattern"); SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32); MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, @@ -1127,7 +1149,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root, // In a call sequence, stores to the argument stack area are relative to the // stack pointer. 
- const MachinePointerInfo &PtrInfo = cast(Root)->getPointerInfo(); + const MachinePointerInfo &PtrInfo = cast(Parent)->getPointerInfo(); unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ? Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg(); @@ -1142,9 +1164,25 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root, SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); - // Offsets in vaddr must be positive. + // Offsets in vaddr must be positive if range checking is enabled. + // + // The total computation of vaddr + soffset + offset must not overflow. If + // vaddr is negative, even if offset is 0 the sgpr offset add will end up + // overflowing. + // + // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would + // always perform a range check. If a negative vaddr base index was used, + // this would fail the range check. The overall address computation would + // compute a valid address, but this doesn't happen due to the range + // check. For out-of-bounds MUBUF loads, a 0 is returned. + // + // Therefore it should be safe to fold any VGPR offset on gfx9 into the + // MUBUF vaddr, but not on older subtargets which can only do this if the + // sign bit is known 0. 
ConstantSDNode *C1 = cast(N1); - if (isLegalMUBUFImmOffset(C1)) { + if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) && + (!Subtarget->privateMemoryResourceIsRangeChecked() || + CurDAG->SignBitIsZero(N0))) { std::tie(VAddr, SOffset) = foldFrameIndex(N0); ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); return true; @@ -1157,13 +1195,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root, return true; } -bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root, +bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset) const { ConstantSDNode *CAddr = dyn_cast(Addr); - if (!CAddr || !isLegalMUBUFImmOffset(CAddr)) + if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) return false; SDLoc DL(Addr); @@ -1172,7 +1210,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root, SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); - const MachinePointerInfo &PtrInfo = cast(Root)->getPointerInfo(); + const MachinePointerInfo &PtrInfo = cast(Parent)->getPointerInfo(); unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ? Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg(); @@ -1231,24 +1269,30 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant, SDValue &SOffset, SDValue &ImmOffset) const { SDLoc DL(Constant); + const uint32_t Align = 4; + const uint32_t MaxImm = alignDown(4095, Align); uint32_t Imm = cast(Constant)->getZExtValue(); uint32_t Overflow = 0; - if (Imm >= 4096) { - if (Imm <= 4095 + 64) { - // Use an SOffset inline constant for 1..64 - Overflow = Imm - 4095; - Imm = 4095; + if (Imm > MaxImm) { + if (Imm <= MaxImm + 64) { + // Use an SOffset inline constant for 4..64 + Overflow = Imm - MaxImm; + Imm = MaxImm; } else { // Try to keep the same value in SOffset for adjacent loads, so that // the corresponding register contents can be re-used. 
// - // Load values with all low-bits set into SOffset, so that a larger - // range of values can be covered using s_movk_i32 - uint32_t High = (Imm + 1) & ~4095; - uint32_t Low = (Imm + 1) & 4095; + // Load values with all low-bits (except for alignment bits) set into + // SOffset, so that a larger range of values can be covered using + // s_movk_i32. + // + // Atomic operations fail to work correctly when individual address + // components are unaligned, even if their sum is aligned. + uint32_t High = (Imm + Align) & ~4095; + uint32_t Low = (Imm + Align) & 4095; Imm = Low; - Overflow = High - 1; + Overflow = High - Align; } } @@ -1316,6 +1360,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset, return true; } +template bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr, SDValue &VAddr, SDValue &Offset, @@ -1326,8 +1371,10 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr, CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); - uint64_t COffsetVal = cast(N1)->getZExtValue(); - if (isUInt<12>(COffsetVal)) { + int64_t COffsetVal = cast(N1)->getSExtValue(); + + if ((IsSigned && isInt<13>(COffsetVal)) || + (!IsSigned && isUInt<12>(COffsetVal))) { Addr = N0; OffsetVal = COffsetVal; } @@ -1344,7 +1391,14 @@ bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const { - return SelectFlatOffset(Addr, VAddr, Offset, SLC); + return SelectFlatOffset(Addr, VAddr, Offset, SLC); +} + +bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDValue Addr, + SDValue &VAddr, + SDValue &Offset, + SDValue &SLC) const { + return SelectFlatOffset(Addr, VAddr, Offset, SLC); } bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, @@ -1443,13 +1497,6 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr, return !Imm && isa(Offset); } -bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr, - SDValue &Offset) const { - bool Imm; - return 
SelectSMRDOffset(Addr, Offset, Imm) && !Imm && - !isa(Offset); -} - bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const { @@ -1622,18 +1669,55 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { return; } - if (isCBranchSCC(N)) { - // This brcond will use S_CBRANCH_SCC*, so let tablegen handle it. + bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N); + unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ; + unsigned CondReg = UseSCCBr ? AMDGPU::SCC : AMDGPU::VCC; + SDLoc SL(N); + + SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond); + CurDAG->SelectNodeTo(N, BrOp, MVT::Other, + N->getOperand(2), // Basic Block + VCC.getValue(0)); +} + +void AMDGPUDAGToDAGISel::SelectFMAD(SDNode *N) { + MVT VT = N->getSimpleValueType(0); + if (VT != MVT::f32 || !Subtarget->hasMadMixInsts()) { SelectCode(N); return; } - SDLoc SL(N); + SDValue Src0 = N->getOperand(0); + SDValue Src1 = N->getOperand(1); + SDValue Src2 = N->getOperand(2); + unsigned Src0Mods, Src1Mods, Src2Mods; + + // Avoid using v_mad_mix_f32 unless there is actually an operand using the + // conversion from f16. + bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods); + bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods); + bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods); + + assert(!Subtarget->hasFP32Denormals() && + "fmad selected with denormals enabled"); + // TODO: We can select this with f32 denormals enabled if all the sources are + // converted from f16 (in which case fmad isn't legal). + + if (Sel0 || Sel1 || Sel2) { + // For dummy operands. 
+ SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); + SDValue Ops[] = { + CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0, + CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1, + CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2, + CurDAG->getTargetConstant(0, SDLoc(), MVT::i1), + Zero, Zero + }; - SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC, Cond); - CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other, - N->getOperand(2), // Basic Block - VCC.getValue(0)); + CurDAG->SelectNodeTo(N, AMDGPU::V_MAD_MIX_F32, MVT::f32, Ops); + } else { + SelectCode(N); + } } // This is here because there isn't a way to use the generated sub0_sub1 as the @@ -1652,11 +1736,11 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { MachineSDNode *CmpSwap = nullptr; if (Subtarget->hasAddr64()) { - SDValue SRsrc, VAddr, SOffset, Offset, GLC, SLC; + SDValue SRsrc, VAddr, SOffset, Offset, SLC; if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) { - unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_ADDR64 : - AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_ADDR64; + unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN : + AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN; SDValue CmpVal = Mem->getOperand(2); // XXX - Do we care about glue operands? @@ -1672,8 +1756,8 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { if (!CmpSwap) { SDValue SRsrc, SOffset, Offset, SLC; if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) { - unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET : - AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_OFFSET; + unsigned Opcode = Is32 ? 
AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN : + AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN; SDValue CmpVal = Mem->getOperand(2); SDValue Ops[] = { @@ -1702,9 +1786,9 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { CurDAG->RemoveDeadNode(N); } -bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, - SDValue &SrcMods) const { - unsigned Mods = 0; +bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, + unsigned &Mods) const { + Mods = 0; Src = In; if (Src.getOpcode() == ISD::FNEG) { @@ -1717,10 +1801,20 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, Src = Src.getOperand(0); } - SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } +bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods; + if (SelectVOP3ModsImpl(In, Src, Mods)) { + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; + } + + return false; +} + bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const { SelectVOP3Mods(In, Src, SrcMods); @@ -1864,24 +1958,234 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src, return SelectVOP3PMods(In, Src, SrcMods); } +bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + // FIXME: Handle op_sel + SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3OpSel0(SDValue In, SDValue &Src, + SDValue &SrcMods, + SDValue &Clamp) const { + SDLoc SL(In); + + // FIXME: Handle clamp + Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32); + + return SelectVOP3OpSel(In, Src, SrcMods); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + // FIXME: Handle op_sel + return SelectVOP3Mods(In, Src, SrcMods); +} + +bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods0(SDValue In, SDValue &Src, + SDValue 
&SrcMods, + SDValue &Clamp) const { + SDLoc SL(In); + + // FIXME: Handle clamp + Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32); + + return SelectVOP3OpSelMods(In, Src, SrcMods); +} + +// The return value is not whether the match is possible (which it always is), +// but whether or not a conversion is really used. +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, + unsigned &Mods) const { + Mods = 0; + SelectVOP3ModsImpl(In, Src, Mods); + + if (Src.getOpcode() == ISD::FP_EXTEND) { + Src = Src.getOperand(0); + assert(Src.getValueType() == MVT::f16); + Src = stripBitcast(Src); + + // Be careful about folding modifiers if we already have an abs. fneg is + // applied last, so we don't want to apply an earlier fneg. + if ((Mods & SISrcMods::ABS) == 0) { + unsigned ModsTmp; + SelectVOP3ModsImpl(Src, Src, ModsTmp); + + if ((ModsTmp & SISrcMods::NEG) != 0) + Mods ^= SISrcMods::NEG; + + if ((ModsTmp & SISrcMods::ABS) != 0) + Mods |= SISrcMods::ABS; + } + + // op_sel/op_sel_hi decide the source type and source. + // If the source's op_sel_hi is set, it indicates to do a conversion from fp16. + // If the source's op_sel is set, it picks the high half of the source + // register. + + Mods |= SISrcMods::OP_SEL_1; + if (isExtractHiElt(Src, Src)) { + Mods |= SISrcMods::OP_SEL_0; + + // TODO: Should we try to look for neg/abs here? + } + + return true; + } + + return false; +} + +bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods = 0; + SelectVOP3PMadMixModsImpl(In, Src, Mods); + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +// TODO: Can we identify things like v_mad_mixhi_f16?
+bool AMDGPUDAGToDAGISel::SelectHi16Elt(SDValue In, SDValue &Src) const { + if (In.isUndef()) { + Src = In; + return true; + } + + if (ConstantSDNode *C = dyn_cast(In)) { + SDLoc SL(In); + SDValue K = CurDAG->getTargetConstant(C->getZExtValue() << 16, SL, MVT::i32); + MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + SL, MVT::i32, K); + Src = SDValue(MovK, 0); + return true; + } + + if (ConstantFPSDNode *C = dyn_cast(In)) { + SDLoc SL(In); + SDValue K = CurDAG->getTargetConstant( + C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32); + MachineSDNode *MovK = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + SL, MVT::i32, K); + Src = SDValue(MovK, 0); + return true; + } + + return isExtractHiElt(In, Src); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast(getTargetLowering()); bool IsModified = false; do { IsModified = false; + // Go over all selected nodes and try to fold them a bit more - for (SDNode &Node : CurDAG->allnodes()) { - MachineSDNode *MachineNode = dyn_cast(&Node); + SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin(); + while (Position != CurDAG->allnodes_end()) { + SDNode *Node = &*Position++; + MachineSDNode *MachineNode = dyn_cast(Node); if (!MachineNode) continue; SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG); - if (ResNode != &Node) { - ReplaceUses(&Node, ResNode); + if (ResNode != Node) { + if (ResNode) + ReplaceUses(Node, ResNode); IsModified = true; } } CurDAG->RemoveDeadNodes(); } while (IsModified); } + +void R600DAGToDAGISel::Select(SDNode *N) { + unsigned int Opc = N->getOpcode(); + if (N->isMachineOpcode()) { + N->setNodeId(-1); + return; // Already selected. 
+ } + + switch (Opc) { + default: break; + case AMDGPUISD::BUILD_VERTICAL_VECTOR: + case ISD::SCALAR_TO_VECTOR: + case ISD::BUILD_VECTOR: { + EVT VT = N->getValueType(0); + unsigned NumVectorElts = VT.getVectorNumElements(); + unsigned RegClassID; + // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG + // that adds a 128 bits reg copy when going through TwoAddressInstructions + // pass. We want to avoid 128 bits copies as much as possible because they + // can't be bundled by our scheduler. + switch(NumVectorElts) { + case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; + case 4: + if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) + RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; + else + RegClassID = AMDGPU::R600_Reg128RegClassID; + break; + default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); + } + SelectBuildVector(N, RegClassID); + return; + } + } + + SelectCode(N); +} + +bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *C; + SDLoc DL(Addr); + + if ((C = dyn_cast(Addr))) { + Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && + (C = dyn_cast(Addr.getOperand(0)))) { + Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && + (C = dyn_cast(Addr.getOperand(1)))) { + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); + } else { + Base = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); + } + + return true; +} + +bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, + SDValue &Offset) { + ConstantSDNode *IMMOffset; + + if (Addr.getOpcode() == ISD::ADD + && (IMMOffset = dyn_cast(Addr.getOperand(1))) 
+ && isInt<16>(IMMOffset->getZExtValue())) { + + Base = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), + MVT::i32); + return true; + // If the pointer address is constant, we can move it to the offset field. + } else if ((IMMOffset = dyn_cast(Addr)) + && isInt<16>(IMMOffset->getZExtValue())) { + Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), + SDLoc(CurDAG->getEntryNode()), + AMDGPU::ZERO, MVT::i32); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), + MVT::i32); + return true; + } + + // Default case, no offset + Base = Addr; + Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); + return true; +} diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 258b1737deb3..49929441ef21 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -13,6 +13,10 @@ // //===----------------------------------------------------------------------===// +#define AMDGPU_LOG2E_F 1.44269504088896340735992468100189214f +#define AMDGPU_LN2_F 0.693147180559945309417232121458176568f +#define AMDGPU_LN10_F 2.30258509299404568401799145468436421f + #include "AMDGPUISelLowering.h" #include "AMDGPU.h" #include "AMDGPUCallLowering.h" @@ -20,6 +24,7 @@ #include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" #include "R600MachineFunctionInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -127,27 +132,20 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); } -bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op) -{ - assert(Op.getOpcode() == ISD::OR); - - SDValue N0 = Op->getOperand(0); - SDValue N1 = Op->getOperand(1); - EVT VT = N0.getValueType(); - - if (VT.isInteger() && !VT.isVector()) { - KnownBits LHSKnown, RHSKnown; - 
DAG.computeKnownBits(N0, LHSKnown); +unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) { + KnownBits Known; + EVT VT = Op.getValueType(); + DAG.computeKnownBits(Op, Known); - if (LHSKnown.Zero.getBoolValue()) { - DAG.computeKnownBits(N1, RHSKnown); + return VT.getSizeInBits() - Known.countMinLeadingZeros(); +} - if (!(~RHSKnown.Zero & ~LHSKnown.Zero)) - return true; - } - } +unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); - return false; + // In order for this to be a signed 24-bit value, bit 23, must + // be a sign bit. + return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op); } AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, @@ -323,6 +321,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FROUND, MVT::f32, Custom); setOperationAction(ISD::FROUND, MVT::f64, Custom); + setOperationAction(ISD::FLOG, MVT::f32, Custom); + setOperationAction(ISD::FLOG10, MVT::f32, Custom); + + if (Subtarget->has16BitInsts()) { + setOperationAction(ISD::FLOG, MVT::f16, Custom); + setOperationAction(ISD::FLOG10, MVT::f16, Custom); + } + setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); @@ -399,8 +405,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::MUL, MVT::i64, Expand); setOperationAction(ISD::MULHU, MVT::i64, Expand); setOperationAction(ISD::MULHS, MVT::i64, Expand); - setOperationAction(ISD::UDIV, MVT::i32, Expand); - setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); @@ -416,8 +420,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); if (Subtarget->hasFFBL()) - 
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom); + setOperationAction(ISD::CTTZ, MVT::i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom); setOperationAction(ISD::CTLZ, MVT::i64, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); @@ -475,6 +481,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + setOperationAction(ISD::SETCC, VT, Expand); } static const MVT::SimpleValueType FloatVectorTypes[] = { @@ -492,6 +499,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); @@ -507,6 +516,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + setOperationAction(ISD::SETCC, VT, Expand); } // This causes using an unrolled select operation rather than expansion with @@ -822,6 +832,17 @@ bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return isZExtFree(Val.getValueType(), VT2); } +// v_mad_mix* support a conversion from f16 to f32. +// +// When denormals are enabled there is only one special case where this is OK +// to use, and we don't currently handle it.
+bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode, + EVT DestVT, EVT SrcVT) const { + return Opcode == ISD::FMAD && Subtarget->hasMadMixInsts() && + DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() && + SrcVT.getScalarType() == MVT::f16; +} + bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { // There aren't really 64-bit registers, but pairs of 32-bit ones and only a // limited number of native 64-bit operations. Shrinking an operation to fit @@ -847,9 +868,12 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: + case CallingConv::AMDGPU_LS: return CC_AMDGPU; case CallingConv::C: case CallingConv::Fast: + case CallingConv::Cold: return CC_AMDGPU_Func; default: report_fatal_error("Unsupported calling convention."); @@ -867,9 +891,12 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: + case CallingConv::AMDGPU_LS: return RetCC_SI_Shader; case CallingConv::C: case CallingConv::Fast: + case CallingConv::Cold: return RetCC_AMDGPU_Func; default: report_fatal_error("Unsupported calling convention."); @@ -1000,12 +1027,49 @@ CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg); } -SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const { +SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain, + SelectionDAG &DAG, + MachineFrameInfo &MFI, + int ClobberedFI) const { + SmallVector ArgChains; + int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); + int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; + + // Include the original chain at the beginning of the list. 
When this is + // used by target LowerCall hooks, this helps the legalizer find the + // CALLSEQ_BEGIN node. + ArgChains.push_back(Chain); + + // Add a chain value for each stack argument load that overlaps ClobberedFI. + for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), + UE = DAG.getEntryNode().getNode()->use_end(); + U != UE; ++U) { + if (LoadSDNode *L = dyn_cast(*U)) { + if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) { + if (FI->getIndex() < 0) { + int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); + int64_t InLastByte = InFirstByte; + InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; + + if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || + (FirstByte <= InFirstByte && InFirstByte <= LastByte)) + ArgChains.push_back(SDValue(L, 1)); + } + } + } + } + + // Build a tokenfactor for all the chains. + return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); +} + +SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals, + StringRef Reason) const { SDValue Callee = CLI.Callee; SelectionDAG &DAG = CLI.DAG; - const Function &Fn = *DAG.getMachineFunction().getFunction(); + const Function &Fn = DAG.getMachineFunction().getFunction(); StringRef FuncName(""); @@ -1015,7 +1079,7 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, FuncName = G->getGlobal()->getName(); DiagnosticInfoUnsupported NoCalls( - Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc()); + Fn, Reason + FuncName, CLI.DL.getDebugLoc()); DAG.getContext()->diagnose(NoCalls); if (!CLI.IsTailCall) { @@ -1026,9 +1090,14 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, return DAG.getEntryNode(); } +SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const { + return lowerUnhandledCall(CLI, InVals, "unsupported call to function "); +} + SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { - const Function
&Fn = *DAG.getMachineFunction().getFunction(); + const Function &Fn = DAG.getMachineFunction().getFunction(); DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()); @@ -1057,14 +1126,20 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); case ISD::FROUND: return LowerFROUND(Op, DAG); case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); + case ISD::FLOG: + return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F); + case ISD::FLOG10: + return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: - return LowerCTLZ(Op, DAG); + return LowerCTLZ_CTTZ(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } return Op; @@ -1115,7 +1190,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, } } - const Function &Fn = *DAG.getMachineFunction().getFunction(); + const Function &Fn = DAG.getMachineFunction().getFunction(); DiagnosticInfoUnsupported BadInit( Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc()); DAG.getContext()->diagnose(BadInit); @@ -1261,7 +1336,6 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, return scalarizeVectorLoad(Load, DAG); SDValue BasePtr = Load->getBasePtr(); - EVT PtrVT = BasePtr.getValueType(); EVT MemVT = Load->getMemoryVT(); SDLoc SL(Op); @@ -1282,8 +1356,7 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue, LoMemVT, BaseAlign, Load->getMemOperand()->getFlags()); - SDValue HiPtr = 
DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(Size, SL, PtrVT)); + SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size); SDValue HiLoad = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), @@ -1322,10 +1395,7 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); - EVT PtrVT = BasePtr.getValueType(); - SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(LoMemVT.getStoreSize(), SL, - PtrVT)); + SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize()); const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); unsigned BaseAlign = Store->getAlignment(); @@ -1454,49 +1524,181 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl &Results) const { - assert(Op.getValueType() == MVT::i64); - SDLoc DL(Op); EVT VT = Op.getValueType(); + + assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64"); + EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext()); - SDValue one = DAG.getConstant(1, DL, HalfVT); - SDValue zero = DAG.getConstant(0, DL, HalfVT); + SDValue One = DAG.getConstant(1, DL, HalfVT); + SDValue Zero = DAG.getConstant(0, DL, HalfVT); //HiLo split SDValue LHS = Op.getOperand(0); - SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero); - SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one); + SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero); + SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One); SDValue RHS = Op.getOperand(1); - SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero); - SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one); + SDValue RHS_Lo = 
DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero); + SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One); - if (VT == MVT::i64 && - DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && - DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { + if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) && + DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) { SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), LHS_Lo, RHS_Lo); - SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero}); - SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero}); + SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero}); + SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero}); Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV)); Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM)); return; } + if (isTypeLegal(MVT::i64)) { + // Compute denominator reciprocal. + unsigned FMAD = Subtarget->hasFP32Denormals() ? 
+ (unsigned)AMDGPUISD::FMAD_FTZ : + (unsigned)ISD::FMAD; + + SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo); + SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi); + SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi, + DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32), + Cvt_Lo); + SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1); + SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp, + DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32)); + SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1, + DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32)); + SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2); + SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc, + DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32), + Mul1); + SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2); + SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc); + SDValue Rcp64 = DAG.getBitcast(VT, + DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi})); + + SDValue Zero64 = DAG.getConstant(0, DL, VT); + SDValue One64 = DAG.getConstant(1, DL, VT); + SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1); + SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1); + + SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS); + SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64); + SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1); + SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, + Zero); + SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, + One); + + SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo, + Mulhi1_Lo, Zero1); + SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi, + Mulhi1_Hi, Add1_Lo.getValue(1)); + SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi); + SDValue Add1 = DAG.getBitcast(VT, + 
DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi})); + + SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1); + SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2); + SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, + Zero); + SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, + One); + + SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo, + Mulhi2_Lo, Zero1); + SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc, + Mulhi2_Hi, Add1_Lo.getValue(1)); + SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC, + Zero, Add2_Lo.getValue(1)); + SDValue Add2 = DAG.getBitcast(VT, + DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi})); + SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2); + + SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3); + + SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero); + SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One); + SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo, + Mul3_Lo, Zero1); + SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi, + Mul3_Hi, Sub1_Lo.getValue(1)); + SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi); + SDValue Sub1 = DAG.getBitcast(VT, + DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi})); + + SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT); + SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero, + ISD::SETUGE); + SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero, + ISD::SETUGE); + SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ); + + // TODO: Here and below portions of the code can be enclosed into if/endif. + // Currently control flow is unconditional and we have 4 selects after + // potential endif to substitute PHIs. + + // if C3 != 0 ... 
+ SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo, + RHS_Lo, Zero1); + SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi, + RHS_Hi, Sub1_Lo.getValue(1)); + SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi, + Zero, Sub2_Lo.getValue(1)); + SDValue Sub2 = DAG.getBitcast(VT, + DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi})); + + SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64); + + SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero, + ISD::SETUGE); + SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero, + ISD::SETUGE); + SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ); + + // if (C6 != 0) + SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64); + + SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo, + RHS_Lo, Zero1); + SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi, + RHS_Hi, Sub2_Lo.getValue(1)); + SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi, + Zero, Sub3_Lo.getValue(1)); + SDValue Sub3 = DAG.getBitcast(VT, + DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi})); + + // endif C6 + // endif C3 + + SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE); + SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE); + + SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE); + SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE); + + Results.push_back(Div); + Results.push_back(Rem); + + return; + } + + // r600 expansion. 
// Get Speculative values SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo); SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); - SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); - SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero}); + SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ); + SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero}); REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM); - SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); - SDValue DIV_Lo = zero; + SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ); + SDValue DIV_Lo = Zero; const unsigned halfBitWidth = HalfVT.getSizeInBits(); @@ -1505,7 +1707,7 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, SDValue POS = DAG.getConstant(bitPos, DL, HalfVT); // Get value of high bit SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS); - HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one); + HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One); HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit); // Shift @@ -1514,7 +1716,7 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT); - SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); + SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE); DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); @@ -1971,13 +2173,45 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } -SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { +SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG, + double Log2BaseInverted) const { + EVT VT = Op.getValueType(); + + SDLoc SL(Op); + SDValue Operand = 
Op.getOperand(0); + SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand); + SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT); + + return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand); +} + +static bool isCtlzOpc(unsigned Opc) { + return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; +} + +static bool isCttzOpc(unsigned Opc) { + return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; +} + +SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); - bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; + bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF || + Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF; + + unsigned ISDOpc, NewOpc; + if (isCtlzOpc(Op.getOpcode())) { + ISDOpc = ISD::CTLZ_ZERO_UNDEF; + NewOpc = AMDGPUISD::FFBH_U32; + } else if (isCttzOpc(Op.getOpcode())) { + ISDOpc = ISD::CTTZ_ZERO_UNDEF; + NewOpc = AMDGPUISD::FFBL_B32; + } else + llvm_unreachable("Unexpected OPCode!!!"); + if (ZeroUndef && Src.getValueType() == MVT::i32) - return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src); + return DAG.getNode(NewOpc, SL, MVT::i32, Src); SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); @@ -1990,24 +2224,32 @@ SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32); - SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ); + SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? 
Hi : Lo; + SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ); - SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo); - SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi); + SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo); + SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi); const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32); - SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32); - - // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) - SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi); + SDValue Add, NewOpr; + if (isCtlzOpc(Op.getOpcode())) { + Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32); + // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x)) + NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi); + } else { + Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32); + // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x)) + NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo); + } if (!ZeroUndef) { // Test if the full 64-bit input is zero. // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32, // which we probably don't want. - SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ); - SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0); + SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi; + SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ); + SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0); // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction // with the same cycles, otherwise it is slower. @@ -2018,11 +2260,11 @@ SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { // The instruction returns -1 for 0 input, but the defined intrinsic // behavior is to return the number of bits. 
- NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, - SrcIsZero, Bits32, NewCtlz); + NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, + SrcIsZero, Bits32, NewOpr); } - return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz); + return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr); } SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, @@ -2389,21 +2631,14 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, //===----------------------------------------------------------------------===// static bool isU24(SDValue Op, SelectionDAG &DAG) { - KnownBits Known; - EVT VT = Op.getValueType(); - DAG.computeKnownBits(Op, Known); - - return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24; + return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24; } static bool isI24(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); - - // In order for this to be a signed 24-bit value, bit 23, must - // be a sign bit. return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated // as unsigned 24-bit values. - (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; + AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24; } static bool simplifyI24(SDNode *Node24, unsigned OpIdx, @@ -2665,11 +2900,21 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: case ISD::ANY_EXTEND: { + SDValue X = LHS->getOperand(0); + + if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 && + isTypeLegal(MVT::v2i16)) { + // Prefer build_vector as the canonical form if packed types are legal. 
+ // (shl ([asz]ext i16:x), 16 -> build_vector 0, x + SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL, + { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) }); + return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec); + } + // shl (ext x) => zext (shl x), if shift does not overflow int if (VT != MVT::i64) break; KnownBits Known; - SDValue X = LHS->getOperand(0); DAG.computeKnownBits(X, Known); unsigned LZ = Known.countMinLeadingZeros(); if (LZ < RHSVal) @@ -2678,21 +2923,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0)); return DAG.getZExtOrTrunc(Shl, SL, VT); } - case ISD::OR: - if (!isOrEquivalentToAdd(DAG, LHS)) - break; - LLVM_FALLTHROUGH; - case ISD::ADD: { - // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1) - if (ConstantSDNode *C2 = dyn_cast(LHS->getOperand(1))) { - SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0), - SDValue(RHS, 0)); - SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal, - SDLoc(C2), VT); - return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V); - } - break; - } } if (VT != MVT::i64) @@ -2924,13 +3154,10 @@ static bool isNegativeOne(SDValue Val) { return false; } -static bool isCtlzOpc(unsigned Opc) { - return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF; -} - -SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG, +SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG, SDValue Op, - const SDLoc &DL) const { + const SDLoc &DL, + unsigned Opc) const { EVT VT = Op.getValueType(); EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT); if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() && @@ -2940,11 +3167,11 @@ SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG, if (VT != MVT::i32) Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); - SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op); + SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op); if (VT != MVT::i32) - FFBH = 
DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH); + FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX); - return FFBH; + return FFBX; } // The native instructions return -1 on 0 input. Optimize out a select that @@ -2954,7 +3181,7 @@ SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG, // against the bitwidth. // // TODO: Should probably combine against FFBH_U32 instead of ctlz directly. -SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, +SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const { ConstantSDNode *CmpRhs = dyn_cast(Cond.getOperand(1)); @@ -2965,20 +3192,25 @@ SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, ISD::CondCode CCOpcode = cast(Cond.getOperand(2))->get(); SDValue CmpLHS = Cond.getOperand(0); + unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : + AMDGPUISD::FFBH_U32; + // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x + // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x if (CCOpcode == ISD::SETEQ && - isCtlzOpc(RHS.getOpcode()) && + (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) { - return getFFBH_U32(DAG, CmpLHS, SL); + return getFFBX_U32(DAG, CmpLHS, SL, Opc); } // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x + // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x if (CCOpcode == ISD::SETNE && - isCtlzOpc(LHS.getOpcode()) && + (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) && LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) { - return getFFBH_U32(DAG, CmpLHS, SL); + return getFFBX_U32(DAG, CmpLHS, SL, Opc); } return SDValue(); @@ -3111,7 +3343,7 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, } // There's no reason to not do this if the condition has other uses. 
- return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); + return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI); } static bool isConstantFPZero(SDValue N) { @@ -3581,6 +3813,48 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT); } +SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG, + EVT VT, + const SDLoc &SL, + int64_t Offset) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true); + auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset); + SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32); + + return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4, + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant); +} + +SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG, + const SDLoc &SL, + SDValue Chain, + SDValue StackPtr, + SDValue ArgVal, + int64_t Offset) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset); + + SDValue Ptr = DAG.getObjectPtrOffset(SL, StackPtr, Offset); + SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4, + MachineMemOperand::MODereferenceable); + return Store; +} + +SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG, + const TargetRegisterClass *RC, + EVT VT, const SDLoc &SL, + const ArgDescriptor &Arg) const { + assert(Arg && "Attempting to load missing argument"); + + if (Arg.isRegister()) + return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL); + return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset()); +} + uint32_t AMDGPUTargetLowering::getImplicitParameterOffset( const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const { unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr(); @@ -3608,6 +3882,7 @@ const char* 
AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(ELSE) NODE_NAME_CASE(LOOP) NODE_NAME_CASE(CALL) + NODE_NAME_CASE(TC_RETURN) NODE_NAME_CASE(TRAP) NODE_NAME_CASE(RET_FLAG) NODE_NAME_CASE(RETURN_TO_EPILOG) @@ -3655,6 +3930,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFM) NODE_NAME_CASE(FFBH_U32) NODE_NAME_CASE(FFBH_I32) + NODE_NAME_CASE(FFBL_B32) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MULHI_U24) @@ -3663,6 +3939,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(MUL_LOHI_I24) NODE_NAME_CASE(MAD_U24) NODE_NAME_CASE(MAD_I24) + NODE_NAME_CASE(MAD_I64_I32) + NODE_NAME_CASE(MAD_U64_U32) NODE_NAME_CASE(TEXTURE_FETCH) NODE_NAME_CASE(EXPORT) NODE_NAME_CASE(EXPORT_DONE) @@ -3704,6 +3982,19 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(ATOMIC_DEC) NODE_NAME_CASE(BUFFER_LOAD) NODE_NAME_CASE(BUFFER_LOAD_FORMAT) + NODE_NAME_CASE(BUFFER_STORE) + NODE_NAME_CASE(BUFFER_STORE_FORMAT) + NODE_NAME_CASE(BUFFER_ATOMIC_SWAP) + NODE_NAME_CASE(BUFFER_ATOMIC_ADD) + NODE_NAME_CASE(BUFFER_ATOMIC_SUB) + NODE_NAME_CASE(BUFFER_ATOMIC_SMIN) + NODE_NAME_CASE(BUFFER_ATOMIC_UMIN) + NODE_NAME_CASE(BUFFER_ATOMIC_SMAX) + NODE_NAME_CASE(BUFFER_ATOMIC_UMAX) + NODE_NAME_CASE(BUFFER_ATOMIC_AND) + NODE_NAME_CASE(BUFFER_ATOMIC_OR) + NODE_NAME_CASE(BUFFER_ATOMIC_XOR) + NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } return nullptr; @@ -3754,7 +4045,6 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( Known.resetAll(); // Don't know anything. 
- KnownBits Known2; unsigned Opc = Op.getOpcode(); switch (Opc) { @@ -3787,6 +4077,51 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16); break; } + case AMDGPUISD::MUL_U24: + case AMDGPUISD::MUL_I24: { + KnownBits LHSKnown, RHSKnown; + DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1); + DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1); + + unsigned TrailZ = LHSKnown.countMinTrailingZeros() + + RHSKnown.countMinTrailingZeros(); + Known.Zero.setLowBits(std::min(TrailZ, 32u)); + + unsigned LHSValBits = 32 - std::max(LHSKnown.countMinSignBits(), 8u); + unsigned RHSValBits = 32 - std::max(RHSKnown.countMinSignBits(), 8u); + unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u); + if (MaxValBits >= 32) + break; + bool Negative = false; + if (Opc == AMDGPUISD::MUL_I24) { + bool LHSNegative = !!(LHSKnown.One & (1 << 23)); + bool LHSPositive = !!(LHSKnown.Zero & (1 << 23)); + bool RHSNegative = !!(RHSKnown.One & (1 << 23)); + bool RHSPositive = !!(RHSKnown.Zero & (1 << 23)); + if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive)) + break; + Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative); + } + if (Negative) + Known.One.setHighBits(32 - MaxValBits); + else + Known.Zero.setHighBits(32 - MaxValBits); + break; + } + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IID = cast(Op.getOperand(0))->getZExtValue(); + switch (IID) { + case Intrinsic::amdgcn_mbcnt_lo: + case Intrinsic::amdgcn_mbcnt_hi: { + // These return at most the wavefront size - 1. 
+ unsigned Size = Op.getValueType().getSizeInBits(); + Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2()); + break; + } + default: + break; + } + } } } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index d85aada6053a..3f8a9b1964ca 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -18,13 +18,13 @@ #include "AMDGPU.h" #include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/Target/TargetLowering.h" +#include "llvm/CodeGen/TargetLowering.h" namespace llvm { class AMDGPUMachineFunction; class AMDGPUSubtarget; -class MachineRegisterInfo; +struct ArgDescriptor; class AMDGPUTargetLowering : public TargetLowering { private: @@ -32,10 +32,11 @@ private: /// legalized from a smaller type VT. Need to match pre-legalized type because /// the generic legalization inserts the add/sub between the select and /// compare. - SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const; + SDValue getFFBX_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, unsigned Opc) const; public: - static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op); + static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG); + static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG); protected: const AMDGPUSubtarget *Subtarget; @@ -56,8 +57,10 @@ protected: SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFLOG(SDValue Op, SelectionDAG &Dag, + double Log2BaseInverted) const; - SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const; SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; @@ -88,7 +91,7 @@ protected: SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo 
&DCI) const; SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, + SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const; SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const; @@ -143,6 +146,7 @@ public: bool isZExtFree(Type *Src, Type *Dest) const override; bool isZExtFree(EVT Src, EVT Dest) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; + bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const override; bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; @@ -171,6 +175,15 @@ public: const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; + + SDValue addTokenForArgument(SDValue Chain, + SelectionDAG &DAG, + MachineFrameInfo &MFI, + int ClobberedFI) const; + + SDValue lowerUnhandledCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals, + StringRef Reason) const; SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const override; @@ -237,6 +250,25 @@ public: return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true); } + /// Similar to CreateLiveInRegister, except value may be loaded from a stack + /// slot rather than passed in a register. 
+ SDValue loadStackInputValue(SelectionDAG &DAG, + EVT VT, + const SDLoc &SL, + int64_t Offset) const; + + SDValue storeStackInputValue(SelectionDAG &DAG, + const SDLoc &SL, + SDValue Chain, + SDValue StackPtr, + SDValue ArgVal, + int64_t Offset) const; + + SDValue loadInputValue(SelectionDAG &DAG, + const TargetRegisterClass *RC, + EVT VT, const SDLoc &SL, + const ArgDescriptor &Arg) const; + enum ImplicitParameter { FIRST_IMPLICIT, GRID_DIM = FIRST_IMPLICIT, @@ -268,6 +300,7 @@ enum NodeType : unsigned { // Function call. CALL, + TC_RETURN, TRAP, // Masked control flow nodes. @@ -342,12 +375,15 @@ enum NodeType : unsigned { BFM, // Insert a range of bits into a 32-bit word. FFBH_U32, // ctlz with -1 if input is zero. FFBH_I32, + FFBL_B32, // cttz with -1 if input is zero. MUL_U24, MUL_I24, MULHI_U24, MULHI_I24, MAD_U24, MAD_I24, + MAD_U64_U32, + MAD_I64_I32, MUL_LOHI_I24, MUL_LOHI_U24, TEXTURE_FETCH, @@ -411,6 +447,19 @@ enum NodeType : unsigned { ATOMIC_DEC, BUFFER_LOAD, BUFFER_LOAD_FORMAT, + BUFFER_STORE, + BUFFER_STORE_FORMAT, + BUFFER_ATOMIC_SWAP, + BUFFER_ATOMIC_ADD, + BUFFER_ATOMIC_SUB, + BUFFER_ATOMIC_SMIN, + BUFFER_ATOMIC_UMIN, + BUFFER_ATOMIC_SMAX, + BUFFER_ATOMIC_UMAX, + BUFFER_ATOMIC_AND, + BUFFER_ATOMIC_OR, + BUFFER_ATOMIC_XOR, + BUFFER_ATOMIC_CMPSWAP, LAST_AMDGPU_ISD_NUMBER }; diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp new file mode 100644 index 000000000000..ff9e7b50ed5c --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUInline.cpp @@ -0,0 +1,208 @@ +//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This is AMDGPU specific replacement of the standard inliner. 
+/// The main purpose is to account for the fact that calls are not only expensive +/// on the AMDGPU, but much more expensive if a private memory pointer is +/// passed to a function as an argument. In this situation, we are unable to +/// eliminate private memory in the caller unless inlined and end up with slow +/// and expensive scratch access. Thus, we boost the inline threshold for such +/// functions here. +/// +//===----------------------------------------------------------------------===// + + +#include "AMDGPU.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/IPO/Inliner.h" + +using namespace llvm; + +#define DEBUG_TYPE "inline" + +static cl::opt +ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200), + cl::desc("Cost of alloca argument")); + +// If the amount of scratch memory to eliminate exceeds our ability to allocate +// it into registers we gain nothing by aggressively inlining functions for that +// heuristic.
+static cl::opt +ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), + cl::desc("Maximum alloca size to use for inline cost")); + +namespace { + +/// Inliner pass built on LegacyInlinerBase that supplies its own inline cost, +/// using a per-call-site threshold computed by getInlineThreshold(). +class AMDGPUInliner : public LegacyInlinerBase { + +public: + AMDGPUInliner() : LegacyInlinerBase(ID) { + initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry()); + Params = getInlineParams(); + } + + static char ID; // Pass identification, replacement for typeid + + unsigned getInlineThreshold(CallSite CS) const; + + InlineCost getInlineCost(CallSite CS) override; + + bool runOnSCC(CallGraphSCC &SCC) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + // Set in runOnSCC() and read by getInlineCost(). + TargetTransformInfoWrapperPass *TTIWP; + + InlineParams Params; +}; + +} // end anonymous namespace + +char AMDGPUInliner::ID = 0; +INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline", + "AMDGPU Function Integration/Inlining", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline", + "AMDGPU Function Integration/Inlining", false, false) + +Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); } + +// Record the TTI wrapper pass for use in getInlineCost(), then run the base +// inliner on the SCC. +bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) { + TTIWP = &getAnalysis(); + return LegacyInlinerBase::runOnSCC(SCC); +} + +void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + LegacyInlinerBase::getAnalysisUsage(AU); +} + +/// Compute the inline threshold for \p CS. Starts from the default threshold, +/// switches to the hint threshold for inlinehint callees when that is larger +/// and the caller is not minsize, and adds ArgAllocaCost when the call passes +/// pointers into static private-address-space allocas whose combined size +/// does not exceed ArgAllocaCutoff. +unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const { + int Thres = Params.DefaultThreshold; + + Function *Caller = CS.getCaller(); + // Listen to the inlinehint attribute when it would increase the threshold + // and the caller does not need to minimize its size.
+ Function *Callee = CS.getCalledFunction(); + bool InlineHint = Callee && !Callee->isDeclaration() && + Callee->hasFnAttribute(Attribute::InlineHint); + if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres + && !Caller->hasFnAttribute(Attribute::MinSize)) + Thres = Params.HintThreshold.getValue(); + + const DataLayout &DL = Caller->getParent()->getDataLayout(); + if (!Callee) + return (unsigned)Thres; + + const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*Caller->getParent()); + + // If we have a pointer to private array passed into a function + // it will not be optimized out, leaving scratch usage. + // Increase the inline threshold to allow inlining in this case. + uint64_t AllocaSize = 0; + SmallPtrSet AIVisited; + for (Value *PtrArg : CS.args()) { + Type *Ty = PtrArg->getType(); + if (!Ty->isPointerTy() || + Ty->getPointerAddressSpace() != AS.PRIVATE_ADDRESS) + continue; + PtrArg = GetUnderlyingObject(PtrArg, DL); + if (const AllocaInst *AI = dyn_cast(PtrArg)) { + if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second) + continue; + AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType()); + // If the amount of stack memory is excessive we will not be able + // to get rid of the scratch anyway, bail out. + if (AllocaSize > ArgAllocaCutoff) { + AllocaSize = 0; + break; + } + } + } + if (AllocaSize) + Thres += ArgAllocaCost; + + return (unsigned)Thres; +} + +// Check if call is just a wrapper around another call. +// In this case we only have call and ret instructions.
+static bool isWrapperOnlyCall(CallSite CS) { + Function *Callee = CS.getCalledFunction(); + if (!Callee || Callee->size() != 1) + return false; + const BasicBlock &BB = Callee->getEntryBlock(); + if (const Instruction *I = BB.getFirstNonPHI()) { + if (!isa(I)) { + return false; + } + if (isa(*std::next(I->getIterator()))) { + DEBUG(dbgs() << " Wrapper only call detected: " + << Callee->getName() << '\n'); + return true; + } + } + return false; +} + +/// Compute the inline cost of \p CS. Unknown, declaration-only, noinline and +/// TTI-incompatible callees are never inlined; viable alwaysinline call sites +/// and wrapper-only callees are always inlined; everything else goes through +/// llvm::getInlineCost with the threshold from getInlineThreshold(). +InlineCost AMDGPUInliner::getInlineCost(CallSite CS) { + Function *Callee = CS.getCalledFunction(); + Function *Caller = CS.getCaller(); + + // Reject the call site before touching Callee: getCalledFunction() may have + // returned null, and the TTI lookup below dereferences it. + if (!Callee || Callee->isDeclaration() || CS.isNoInline()) + return llvm::InlineCost::getNever(); + + TargetTransformInfo &TTI = TTIWP->getTTI(*Callee); + if (!TTI.areInlineCompatible(Caller, Callee)) + return llvm::InlineCost::getNever(); + + if (CS.hasFnAttr(Attribute::AlwaysInline)) { + if (isInlineViable(*Callee)) + return llvm::InlineCost::getAlways(); + return llvm::InlineCost::getNever(); + } + + if (isWrapperOnlyCall(CS)) + return llvm::InlineCost::getAlways(); + + InlineParams LocalParams = Params; + LocalParams.DefaultThreshold = (int)getInlineThreshold(CS); + bool RemarksEnabled = false; + const auto &BBs = Caller->getBasicBlockList(); + if (!BBs.empty()) { + auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front()); + if (DI.isEnabled()) + RemarksEnabled = true; + } + + OptimizationRemarkEmitter ORE(Caller); + std::function GetAssumptionCache = + [this](Function &F) -> AssumptionCache & { + return ACT->getAssumptionCache(F); + }; + + return llvm::getInlineCost(CS, Callee, LocalParams, TTI, GetAssumptionCache, + None, PSI, RemarksEnabled ?
&ORE : nullptr); +} diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 69dc52986172..8156599528c2 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -23,14 +23,15 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR -#define GET_INSTRMAP_INFO #include "AMDGPUGenInstrInfo.inc" // Pin the vtable to this file. void AMDGPUInstrInfo::anchor() {} AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST) - : AMDGPUGenInstrInfo(-1, -1), ST(ST), AMDGPUASI(ST.getAMDGPUAS()) {} + : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), + ST(ST), + AMDGPUASI(ST.getAMDGPUAS()) {} // FIXME: This behaves strangely. If, for example, you have 32 load + stores, // the first 16 loads will be interleaved with the stores, and the next 16 will @@ -54,34 +55,15 @@ bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, return (NumLoads <= 16 && (Offset1 - Offset0) < 64); } -int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { - switch (Channels) { - default: return Opcode; - case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1); - case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2); - case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3); - } -} - // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td enum SIEncodingFamily { SI = 0, VI = 1, SDWA = 2, - SDWA9 = 3 + SDWA9 = 3, + GFX9 = 4 }; -// Wrapper for Tablegen'd function. enum Subtarget is not defined in any -// header files, so we need to wrap it in a function that takes unsigned -// instead.
-namespace llvm { -namespace AMDGPU { -static int getMCOpcode(uint16_t Opcode, unsigned Gen) { - return getMCOpcodeGen(Opcode, static_cast(Gen)); -} -} -} - static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { switch (ST.getGeneration()) { case AMDGPUSubtarget::SOUTHERN_ISLANDS: @@ -104,6 +86,11 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { SIEncodingFamily Gen = subtargetEncodingFamily(ST); + + if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && + ST.getGeneration() >= AMDGPUSubtarget::GFX9) + Gen = SIEncodingFamily::GFX9; + if (get(Opcode).TSFlags & SIInstrFlags::SDWA) Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9 : SIEncodingFamily::SDWA; diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 41cc7d7093ec..a9fcd4834638 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -18,10 +18,11 @@ #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "AMDGPUGenInstrInfo.inc" +#undef GET_INSTRINFO_HEADER namespace llvm { @@ -49,10 +50,6 @@ public: /// Return -1 if the target-specific opcode for the pseudo instruction does /// not exist. If Opcode is not a pseudo instruction, this is identity. int pseudoToMCOpcode(int Opcode) const; - - /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the - /// equivalent opcode that writes \p Channels Channels. 
- int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const; }; } // End llvm namespace diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index bcf89bb78ad6..c024010f3e96 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -74,6 +74,8 @@ def AMDGPUAddeSubeOp : SDTypeProfile<2, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>] >; +def SDT_AMDGPUTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; + //===----------------------------------------------------------------------===// // AMDGPU DAG Nodes // @@ -82,6 +84,26 @@ def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>; def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>; def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>; +def callseq_start : SDNode<"ISD::CALLSEQ_START", + SDCallSeqStart<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>, + [SDNPHasChain, SDNPOutGlue] +>; + +def callseq_end : SDNode<"ISD::CALLSEQ_END", + SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue] +>; + +def AMDGPUcall : SDNode<"AMDGPUISD::CALL", + SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic] +>; + +def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", SDT_AMDGPUTCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic] +>; + def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP", SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>, [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue] @@ -276,6 +298,8 @@ def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>; def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>; +def AMDGPUffbl_b32 : SDNode<"AMDGPUISD::FFBL_B32", SDTIntUnaryOp>; + // Signed and unsigned 24-bit multiply. The highest 8-bits are ignore // when performing the mulitply. The result is a 32-bit value. 
def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index e54c887d6090..16d240e96196 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -402,7 +402,8 @@ bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { return Ret; } -bool AMDGPUInstructionSelector::select(MachineInstr &I) const { +bool AMDGPUInstructionSelector::select(MachineInstr &I, + CodeGenCoverage &CoverageInfo) const { if (!isPreISelGenericOpcode(I.getOpcode())) return true; diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index ef845f44d365..715c4882f380 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -35,7 +35,8 @@ public: AMDGPUInstructionSelector(const SISubtarget &STI, const AMDGPURegisterBankInfo &RBI); - bool select(MachineInstr &I) const override; + bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + private: struct GEPInfo { const MachineInstr &GEP; diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 4e688ab0b105..31f728b0c22f 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -42,10 +42,14 @@ class AMDGPUShaderInst Inst = 0xffffffff; } -def FP16Denormals : Predicate<"Subtarget.hasFP16Denormals()">; -def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; -def FP64Denormals : Predicate<"Subtarget.hasFP64Denormals()">; +def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">; +def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">; +def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">; +def NoFP16Denormals : Predicate<"!Subtarget->hasFP16Denormals()">; +def NoFP32Denormals : Predicate<"!Subtarget->hasFP32Denormals()">; 
+def NoFP64Denormals : Predicate<"!Subtarget->hasFP64Denormals()">; def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; +def FMA : Predicate<"Subtarget->hasFMA()">; def InstFlag : OperandWithDefaultOps ; def ADDRIndirect : ComplexPattern; @@ -130,6 +134,29 @@ def shl_oneuse : HasOneUseBinOp; def select_oneuse : HasOneUseTernaryOp